From 2974b049e78747481296a07754652be73bfefe7a Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Tue, 26 Jul 2016 08:58:34 +0900
Subject: [PATCH] cudev: add feature to convert FP32(float) from/to FP16(half)
 on GPU   * add feature of Fp16 on GPU (cudev)   * add test   * leave template
 function as unimplemented to raise error

---
 modules/core/include/opencv2/core/cuda.hpp    |  2 +
 modules/core/src/cuda/gpu_mat.cu              | 43 ++++++++++++
 .../opencv2/cudev/functional/functional.hpp   | 21 ++++++
 .../opencv2/cudev/util/saturate_cast.hpp      | 11 +++
 modules/cudev/test/test_cvt.cu                | 67 +++++++++++++++++++
 5 files changed, 144 insertions(+)
diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 96685513b1..0f7bf01c73 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -855,6 +855,8 @@ private:
 CV_EXPORTS void printCudaDeviceInfo(int device);
 CV_EXPORTS void printShortCudaDeviceInfo(int device);
 
+CV_EXPORTS void convertFp16Cuda(InputArray _src, OutputArray _dst, Stream& stream = Stream::Null());
+
 //! @} cudacore_init
 
 }} // namespace cv { namespace cuda {
diff --git a/modules/core/src/cuda/gpu_mat.cu b/modules/core/src/cuda/gpu_mat.cu
index a558349703..a772e43abc 100644
--- a/modules/core/src/cuda/gpu_mat.cu
+++ b/modules/core/src/cuda/gpu_mat.cu
@@ -510,6 +510,17 @@ namespace
 
         gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
     }
+
+    template <typename T, typename D>
+    void convertScaleHalf(const GpuMat& src, const GpuMat& dst, Stream& stream)
+    {
+        typedef typename VecTraits<T>::elem_type src_elem_type;
+        typedef typename VecTraits<D>::elem_type dst_elem_type;
+        typedef typename LargerType<src_elem_type, float>::type larger_elem_type;
+        typedef typename LargerType<float, dst_elem_type>::type scalar_type;
+
+        gridTransformUnary_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_fp16_func<T,D>(), stream);
+    }
 }
 
 void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) const
@@ -583,4 +594,36 @@ void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, doub
     funcs[sdepth][ddepth](reshape(1), dst.reshape(1), alpha, beta, stream);
 }
 
+void cv::cuda::convertFp16Cuda(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+    int ddepth = 0;
+
+    switch(src.depth())
+    {
+    case CV_32F:
+        ddepth = CV_16S;
+        break;
+    case CV_16S:
+        ddepth = CV_32F;
+        break;
+    default:
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
+        return;
+    }
+    int type = CV_MAKE_TYPE(CV_MAT_DEPTH(ddepth), src.channels());
+    _dst.create(src.size(), type);
+    GpuMat dst = _dst.getGpuMat();
+
+    typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
+    static const func_t funcs[] =
+    {
+        0, 0, 0,
+        convertScaleHalf<float, short>, 0, convertScaleHalf<short, float>,
+        0, 0,
+    };
+
+    funcs[ddepth](src.reshape(1), dst.reshape(1), stream);
+}
+
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/functional/functional.hpp b/modules/cudev/include/opencv2/cudev/functional/functional.hpp
index 125b66f07a..ea09c0b322 100644
--- a/modules/cudev/include/opencv2/cudev/functional/functional.hpp
+++ b/modules/cudev/include/opencv2/cudev/functional/functional.hpp
@@ -668,6 +668,27 @@ template <typename T, typename D> struct saturate_cast_func : unary_function<T,
     }
 };
 
+// Convert Fp16 dummy
+template <typename T, typename D> struct saturate_cast_fp16_func;
+
+// Convert Fp16 from Fp32
+template <> struct saturate_cast_fp16_func<float, short> : unary_function<float, short>
+{
+    __device__ __forceinline__ short operator ()(float v) const
+    {
+        return cast_fp16<float, short>(v);
+    }
+};
+
+// Convert Fp16 to Fp32
+template <> struct saturate_cast_fp16_func<short, float> : unary_function<short, float>
+{
+    __device__ __forceinline__ float operator ()(short v) const
+    {
+        return cast_fp16<short, float>(v);
+    }
+};
+
 // Threshold Functors
 
 template <typename T> struct ThreshBinaryFunc : unary_function<T, T>
diff --git a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
index 3176542d2c..e39550be85 100644
--- a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
@@ -270,6 +270,17 @@ template <> __device__ __forceinline__ uint saturate_cast<uint>(double v)
 #endif
 }
 
+template <typename T, typename D> __device__ __forceinline__ D cast_fp16(T v);
+
+template <> __device__ __forceinline__ float cast_fp16<short, float>(short v)
+{
+    return __half2float(v);
+}
+
+template <> __device__ __forceinline__ short cast_fp16<float, short>(float v)
+{
+    return (short)__float2half_rn(v);
+}
 //! @}
 
 }}
diff --git a/modules/cudev/test/test_cvt.cu b/modules/cudev/test/test_cvt.cu
index b1c3d10f66..4e11b6319c 100644
--- a/modules/cudev/test/test_cvt.cu
+++ b/modules/cudev/test/test_cvt.cu
@@ -49,6 +49,7 @@ using namespace cv::cudev;
 using namespace cvtest;
 
 typedef ::testing::Types<uchar, ushort, short, int, float> AllTypes;
+typedef ::testing::Types<short, float> Fp16Types;
 
 ////////////////////////////////////////////////////////////////////////////////
 // CvtTest
@@ -75,9 +76,75 @@ public:
     }
 };
 
+// dummy class
+template <typename T>
+class CvFp16Test : public ::testing::Test
+{
+public:
+    void test_gpumat()
+    {
+    }
+};
+
+template <>
+class CvFp16Test <short> : public ::testing::Test
+{
+public:
+    void test_gpumat()
+    {
+        const Size size = randomSize(100, 400);
+        const int type = DataType<float>::type;
+
+        Mat src = randomMat(size, type), dst, ref;
+
+        GpuMat_<float> g_src(src);
+        GpuMat g_dst;
+
+        // Fp32 -> Fp16
+        convertFp16Cuda(g_src, g_dst);
+        convertFp16Cuda(g_dst.clone(), g_dst);
+        // Fp16 -> Fp32
+        convertFp16(src, dst);
+        convertFp16(dst, ref);
+
+        g_dst.download(dst);
+        EXPECT_MAT_NEAR(dst, ref, 0.0);
+    }
+};
+
+template <>
+class CvFp16Test <float> : public ::testing::Test
+{
+public:
+    void test_gpumat()
+    {
+        const Size size = randomSize(100, 400);
+        const int type = DataType<float>::type;
+
+        Mat src = randomMat(size, type), dst, ref;
+
+        GpuMat_<float> g_src(src);
+        GpuMat g_dst;
+
+        // Fp32 -> Fp16
+        convertFp16Cuda(g_src, g_dst);
+        convertFp16(src, ref);
+
+        g_dst.download(dst);
+        EXPECT_MAT_NEAR(dst, ref, 0.0);
+    }
+};
+
 TYPED_TEST_CASE(CvtTest, AllTypes);
 
 TYPED_TEST(CvtTest, GpuMat)
 {
     CvtTest<TypeParam>::test_gpumat();
 }
+
+TYPED_TEST_CASE(CvFp16Test, Fp16Types);
+
+TYPED_TEST(CvFp16Test, GpuMat)
+{
+    CvFp16Test<TypeParam>::test_gpumat();
+}