From 2974b049e78747481296a07754652be73bfefe7a Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Tue, 26 Jul 2016 08:58:34 +0900 Subject: [PATCH] cudev: add feature to convert FP32(float) from/to FP16(half) on GPU * add feature of Fp16 on GPU (cudev) * add test * leave template function as unimplemented to raise error --- modules/core/include/opencv2/core/cuda.hpp | 2 + modules/core/src/cuda/gpu_mat.cu | 43 ++++++++++++ .../opencv2/cudev/functional/functional.hpp | 21 ++++++ .../opencv2/cudev/util/saturate_cast.hpp | 11 +++ modules/cudev/test/test_cvt.cu | 67 +++++++++++++++++++ 5 files changed, 144 insertions(+) diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp index 96685513b1..0f7bf01c73 100644 --- a/modules/core/include/opencv2/core/cuda.hpp +++ b/modules/core/include/opencv2/core/cuda.hpp @@ -855,6 +855,8 @@ private: CV_EXPORTS void printCudaDeviceInfo(int device); CV_EXPORTS void printShortCudaDeviceInfo(int device); +CV_EXPORTS void convertFp16Cuda(InputArray _src, OutputArray _dst, Stream& stream = Stream::Null()); + //! @} cudacore_init }} // namespace cv { namespace cuda { diff --git a/modules/core/src/cuda/gpu_mat.cu b/modules/core/src/cuda/gpu_mat.cu index a558349703..a772e43abc 100644 --- a/modules/core/src/cuda/gpu_mat.cu +++ b/modules/core/src/cuda/gpu_mat.cu @@ -510,6 +510,17 @@ namespace gridTransformUnary_< ConvertToPolicy >(globPtr(src), globPtr(dst), op, stream); } + + template + void convertScaleHalf(const GpuMat& src, const GpuMat& dst, Stream& stream) + { + typedef typename VecTraits::elem_type src_elem_type; + typedef typename VecTraits::elem_type dst_elem_type; + typedef typename LargerType::type larger_elem_type; + typedef typename LargerType::type scalar_type; + + gridTransformUnary_< ConvertToPolicy >(globPtr(src), globPtr(dst), saturate_cast_fp16_func(), stream); + } } void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) const @@ -583,4 +594,36 @@ void cv::cuda::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, doub funcs[sdepth][ddepth](reshape(1), dst.reshape(1), alpha, beta, stream); } +void cv::cuda::convertFp16Cuda(InputArray _src, OutputArray _dst, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + int ddepth = 0; + + switch(src.depth()) + { + case CV_32F: + ddepth = CV_16S; + break; + case CV_16S: + ddepth = CV_32F; + break; + default: + CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth"); + return; + } + int type = CV_MAKE_TYPE(CV_MAT_DEPTH(ddepth), src.channels()); + _dst.create(src.size(), type); + GpuMat dst = _dst.getGpuMat(); + + typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream); + static const func_t funcs[] = + { + 0, 0, 0, + convertScaleHalf, 0, convertScaleHalf, + 0, 0, + }; + + funcs[ddepth](src.reshape(1), dst.reshape(1), stream); +} + #endif diff --git a/modules/cudev/include/opencv2/cudev/functional/functional.hpp b/modules/cudev/include/opencv2/cudev/functional/functional.hpp index 125b66f07a..ea09c0b322 100644 --- a/modules/cudev/include/opencv2/cudev/functional/functional.hpp +++ b/modules/cudev/include/opencv2/cudev/functional/functional.hpp @@ -668,6 +668,27 @@ template struct saturate_cast_func : unary_function struct saturate_cast_fp16_func; + +// Convert Fp16 from Fp32 +template <> struct saturate_cast_fp16_func : unary_function +{ + __device__ __forceinline__ short operator ()(float v) const + { + return cast_fp16(v); + } +}; + +// Convert Fp16 to Fp32 +template <> struct saturate_cast_fp16_func : unary_function +{ + __device__ __forceinline__ float operator ()(short v) const + { + return cast_fp16(v); + } +}; + // Threshold Functors template struct ThreshBinaryFunc : unary_function diff --git a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp index 3176542d2c..e39550be85 100644 --- a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp +++ b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp @@ -270,6 +270,17 @@ template <> __device__ __forceinline__ uint saturate_cast(double v) #endif } +template __device__ __forceinline__ D cast_fp16(T v); + +template <> __device__ __forceinline__ float cast_fp16(short v) +{ + return __half2float(v); +} + +template <> __device__ __forceinline__ short cast_fp16(float v) +{ + return (short)__float2half_rn(v); +} //! @} }} diff --git a/modules/cudev/test/test_cvt.cu b/modules/cudev/test/test_cvt.cu index b1c3d10f66..4e11b6319c 100644 --- a/modules/cudev/test/test_cvt.cu +++ b/modules/cudev/test/test_cvt.cu @@ -49,6 +49,7 @@ using namespace cv::cudev; using namespace cvtest; typedef ::testing::Types AllTypes; +typedef ::testing::Types Fp16Types; //////////////////////////////////////////////////////////////////////////////// // CvtTest @@ -75,9 +76,75 @@ public: } }; +// dummy class +template +class CvFp16Test : public ::testing::Test +{ +public: + void test_gpumat() + { + } +}; + +template <> +class CvFp16Test : public ::testing::Test +{ +public: + void test_gpumat() + { + const Size size = randomSize(100, 400); + const int type = DataType::type; + + Mat src = randomMat(size, type), dst, ref; + + GpuMat_ g_src(src); + GpuMat g_dst; + + // Fp32 -> Fp16 + convertFp16Cuda(g_src, g_dst); + convertFp16Cuda(g_dst.clone(), g_dst); + // Fp16 -> Fp32 + convertFp16(src, dst); + convertFp16(dst, ref); + + g_dst.download(dst); + EXPECT_MAT_NEAR(dst, ref, 0.0); + } +}; + +template <> +class CvFp16Test : public ::testing::Test +{ +public: + void test_gpumat() + { + const Size size = randomSize(100, 400); + const int type = DataType::type; + + Mat src = randomMat(size, type), dst, ref; + + GpuMat_ g_src(src); + GpuMat g_dst; + + // Fp32 -> Fp16 + convertFp16Cuda(g_src, g_dst); + convertFp16(src, ref); + + g_dst.download(dst); + EXPECT_MAT_NEAR(dst, ref, 0.0); + } +}; + TYPED_TEST_CASE(CvtTest, AllTypes); TYPED_TEST(CvtTest, GpuMat) { CvtTest::test_gpumat(); } + +TYPED_TEST_CASE(CvFp16Test, Fp16Types); + +TYPED_TEST(CvFp16Test, GpuMat) +{ + CvFp16Test::test_gpumat(); +}