From b2135be5942c98cd638f3cef4845672573668d73 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Wed, 24 Jul 2019 14:12:40 -0500 Subject: [PATCH 1/3] fast_math: add extra perf/unit tests Add a basic sanity test to verify the rounding functions work as expected. Likewise, extend the rounding performance test to cover the additional float -> int fast math functions. --- modules/core/perf/perf_cvround.cpp | 78 +++++++++++++++++------------- modules/core/test/test_math.cpp | 54 +++++++++++++++++++++ 2 files changed, 98 insertions(+), 34 deletions(-) diff --git a/modules/core/perf/perf_cvround.cpp b/modules/core/perf/perf_cvround.cpp index 933792dcaa..0e3ceb0597 100644 --- a/modules/core/perf/perf_cvround.cpp +++ b/modules/core/perf/perf_cvround.cpp @@ -4,42 +4,52 @@ namespace opencv_test { using namespace perf; -template <typename T> -static void CvRoundMat(const cv::Mat & src, cv::Mat & dst) -{ - for (int y = 0; y < dst.rows; ++y) - { - const T * sptr = src.ptr<T>(y); - int * dptr = dst.ptr<int>(y); - - for (int x = 0; x < dst.cols; ++x) - dptr[x] = cvRound(sptr[x]); +#define DECL_ROUND_TEST(NAME, OP, EXTRA) \ + template <typename T> \ + static void OP ## Mat(const cv::Mat & src, cv::Mat & dst) \ + { \ + for (int y = 0; y < dst.rows; ++y) \ + { \ + const T * sptr = src.ptr<T>(y); \ + int * dptr = dst.ptr<int>(y); \ + \ + for (int x = 0; x < dst.cols; ++x) \ + dptr[x] = OP(sptr[x]) EXTRA; \ + } \ + } \ + \ + PERF_TEST_P(Size_MatType, CvRound_Float ## NAME, \ + testing::Combine(testing::Values(TYPICAL_MAT_SIZES), \ + testing::Values(CV_32FC1, CV_64FC1))) \ + { \ + Size size = get<0>(GetParam()); \ + int type = get<1>(GetParam()), depth = CV_MAT_DEPTH(type); \ + \ + cv::Mat src(size, type), dst(size, CV_32SC1); \ + \ + declare.in(src, WARMUP_RNG).out(dst); \ + \ + if (depth == CV_32F) \ + { \ + TEST_CYCLE() \ + OP ## Mat<float>(src, dst); \ + } \ + else if (depth == CV_64F) \ + { \ + TEST_CYCLE() \ + OP ## Mat<double>(src, dst); \ + } \ + \ + SANITY_CHECK_NOTHING(); \ } -} - -PERF_TEST_P(Size_MatType, CvRound_Float, - 
testing::Combine(testing::Values(TYPICAL_MAT_SIZES), - testing::Values(CV_32FC1, CV_64FC1))) -{ - Size size = get<0>(GetParam()); - int type = get<1>(GetParam()), depth = CV_MAT_DEPTH(type); - cv::Mat src(size, type), dst(size, CV_32SC1); - - declare.in(src, WARMUP_RNG).out(dst); - - if (depth == CV_32F) - { - TEST_CYCLE() - CvRoundMat<float>(src, dst); - } - else if (depth == CV_64F) - { - TEST_CYCLE() - CvRoundMat<double>(src, dst); - } +DECL_ROUND_TEST(,cvRound,) +DECL_ROUND_TEST(_Ceil,cvCeil,) +DECL_ROUND_TEST(_Floor,cvFloor,) - SANITY_CHECK_NOTHING(); -} +/* For FP classification tests, try to test them in a way which uses + branching logic and avoids extra FP logic. */ +DECL_ROUND_TEST(_NaN,cvIsNaN, ? 1 : 2) +DECL_ROUND_TEST(_Inf,cvIsInf, ? 1 : 2) } // namespace diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index d65b3fa39e..8b13a391cb 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -3923,5 +3923,59 @@ TEST(Core_SoftFloat, CvRound) } } +template <typename T> +static void checkRounding(T in, int outCeil, int outFloor) +{ + EXPECT_EQ(outCeil,cvCeil(in)); + EXPECT_EQ(outFloor,cvFloor(in)); + + /* cvRound is not expected to be IEEE compliant. The implementation + should round to one of the above. 
*/ + EXPECT_TRUE((cvRound(in) == outCeil) || (cvRound(in) == outFloor)); +} + +TEST(Core_FastMath, InlineRoundingOps) +{ + struct + { + double in; + int outCeil; + int outFloor; + } values[] = + { + // Values are chosen to convert to binary float 32/64 exactly + { 1.0, 1, 1 }, + { 1.5, 2, 1 }, + { -1.5, -1, -2} + }; + + for (int i = 0, maxi = sizeof(values) / sizeof(values[0]); i < maxi; i++) + { + checkRounding(values[i].in, values[i].outCeil, values[i].outFloor); + checkRounding((float)values[i].in, values[i].outCeil, values[i].outFloor); + } +} + +TEST(Core_FastMath, InlineNaN) +{ + EXPECT_EQ( cvIsNaN((float) NAN), 1); + EXPECT_EQ( cvIsNaN((float) -NAN), 1); + EXPECT_EQ( cvIsNaN(0.0f), 0); + EXPECT_EQ( cvIsNaN((double) NAN), 1); + EXPECT_EQ( cvIsNaN((double) -NAN), 1); + EXPECT_EQ( cvIsNaN(0.0), 0); +} + +TEST(Core_FastMath, InlineIsInf) +{ + // Assume HUGE_VAL is infinity. Strictly speaking, may not always be true. + EXPECT_EQ( cvIsInf((float) HUGE_VAL), 1); + EXPECT_EQ( cvIsInf((float) -HUGE_VAL), 1); + EXPECT_EQ( cvIsInf(0.0f), 0); + EXPECT_EQ( cvIsInf((double) HUGE_VAL), 1); + EXPECT_EQ( cvIsInf((double) -HUGE_VAL), 1); + EXPECT_EQ( cvIsInf(0.0), 0); +} + }} // namespace /* End of file. */ From 3f92bcc11ab122b6cb5167c3548a0e2ce306c94d Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Mon, 22 Jul 2019 14:23:49 -0500 Subject: [PATCH 2/3] fast_math: selectively use GCC rounding builtins when available Add a new macro definition OPENCV_USE_FASTMATH_GCC_BUILTINS to enable usage of GCC inline math functions, if available and requested by the user. Likewise, enable it for POWER. This is nearly always a substantial improvement over using integer manipulation as most operations can be done in several instructions with no branching. The result is a 1.5-1.8x speedup in the ceil/floor operations. 1. As tested with AT 12.0-1 (GCC 8.3.1) compiler on P9 LE. 
--- .../core/include/opencv2/core/fast_math.hpp | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp index d9ea28e632..6eb6f1fa00 100644 --- a/modules/core/include/opencv2/core/fast_math.hpp +++ b/modules/core/include/opencv2/core/fast_math.hpp @@ -92,6 +92,19 @@ #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") #endif +#if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS + /* Let GCC inline C math functions when available. Dedicated hardware is available to + round and convert FP values. */ + #define OPENCV_USE_FASTMATH_GCC_BUILTINS +#endif + +/* Enable GCC builtin math functions if possible, desired, and available. + Note, not all math functions inline equally. E.g. lrint will not inline + without the -fno-math-errno option. */ +#if defined OPENCV_USE_FASTMATH_GCC_BUILTINS && defined __GNUC__ && !defined __clang__ && !defined (__CUDACC__) + #define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS +#endif + /** @brief Rounds floating-point number to the nearest integer @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the @@ -138,8 +151,12 @@ cvRound( double value ) */ CV_INLINE int cvFloor( double value ) { +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS + return __builtin_floor(value); +#else int i = (int)value; return i - (i > value); +#endif } /** @brief Rounds floating-point number to the nearest integer not smaller than the original. @@ -151,8 +168,12 @@ CV_INLINE int cvFloor( double value ) */ CV_INLINE int cvCeil( double value ) { +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS + return __builtin_ceil(value); +#else int i = (int)value; return i + (i < value); +#endif } /** @brief Determines if the argument is Not A Number. 
@@ -225,8 +246,12 @@ CV_INLINE int cvRound( int value ) /** @overload */ CV_INLINE int cvFloor( float value ) { +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS + return __builtin_floorf(value); +#else int i = (int)value; return i - (i > value); +#endif } /** @overload */ @@ -238,8 +263,12 @@ CV_INLINE int cvFloor( int value ) /** @overload */ CV_INLINE int cvCeil( float value ) { +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS + return __builtin_ceilf(value); +#else int i = (int)value; return i + (i < value); +#endif } /** @overload */ From f38a61c66d8e45ad0b5e21bdc6e55223d1cdb59e Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Mon, 22 Jul 2019 14:23:56 -0500 Subject: [PATCH 3/3] fast_math: implement optimized PPC routines Implement cvRound using inline asm. No compiler support exists today to properly optimize this. This results in about a 4x speedup over the default rounding. Likewise, simplify the growing number of rounding function overloads. For P9 enabled targets, utilize the classification testing instruction to test for Inf/Nan values. Operation speedup is about 1.2x for FP32, and 1.5x for FP64 operands. For P8 targets, fallback to the GCC nan inline. It provides a 1.1/1.4x improvement for FP32/FP64 arguments. 
--- .../core/include/opencv2/core/fast_math.hpp | 84 ++++++++++++++----- 1 file changed, 62 insertions(+), 22 deletions(-) diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp index 6eb6f1fa00..b1e8c4202d 100644 --- a/modules/core/include/opencv2/core/fast_math.hpp +++ b/modules/core/include/opencv2/core/fast_math.hpp @@ -74,7 +74,15 @@ # include "tegra_round.hpp" #endif -#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__) +#if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__) +# include <altivec.h> +#endif + +#if ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ + defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION + #define CV_INLINE_ROUND_DBL(value) TEGRA_ROUND_DBL(value); + #define CV_INLINE_ROUND_FLT(value) TEGRA_ROUND_FLT(value); +#elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__) // 1. general scheme #define ARM_ROUND(_value, _asm_string) \ int res; \ @@ -84,12 +92,32 @@ return res // 2. version for double #ifdef __clang__ - #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") + #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") #else - #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") + #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") #endif // 3. 
version for float - #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") + #define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") +#elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__) + // P8 and newer machines can convert fp32/64 to int quickly. + #define CV_INLINE_ROUND_DBL(value) \ + int out; \ + double temp; \ + __asm__( "fctiw %[temp],%[in]\n\tmffprwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \ + return out; + + // FP32 also works with FP64 routine above + #define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value) + + #ifdef _ARCH_PWR9 + #define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30); + #define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40); + #define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value) + #define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value) + #endif +#elif defined CV_ICC || defined __GNUC__ + #define CV_INLINE_ROUND_DBL(value) return (int)(lrint(value)); + #define CV_INLINE_ROUND_FLT(value) return (int)(lrintf(value)); #endif #if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS @@ -105,6 +133,16 @@ #define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS #endif +/* Allow overrides for some functions which may benefit from tuning. Likewise, + note that isinf is not used as the return value is signed. */ +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_DBL + #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value); +#endif + +#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_FLT + #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value); +#endif + /** @brief Rounds floating-point number to the nearest integer @param value floating-point number. If the value is outside of INT_MIN ... 
INT_MAX range, the @@ -125,15 +163,8 @@ cvRound( double value ) fistp t; } return t; -#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ - defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_DBL(value); -#elif defined CV_ICC || defined __GNUC__ -# if defined ARM_ROUND_DBL - ARM_ROUND_DBL(value); -# else - return (int)lrint(value); -# endif +#elif defined CV_INLINE_ROUND_DBL + CV_INLINE_ROUND_DBL(value); #else /* it's ok if round does not comply with IEEE754 standard; the tests should allow +/-1 difference when the tested functions use round */ @@ -184,10 +215,14 @@ CV_INLINE int cvCeil( double value ) otherwise. */ CV_INLINE int cvIsNaN( double value ) { +#if defined CV_INLINE_ISNAN_DBL + CV_INLINE_ISNAN_DBL(value); +#else Cv64suf ieee754; ieee754.f = value; return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + ((unsigned)ieee754.u != 0) > 0x7ff00000; +#endif } /** @brief Determines if the argument is Infinity. @@ -198,10 +233,14 @@ CV_INLINE int cvIsNaN( double value ) and 0 otherwise. 
*/ CV_INLINE int cvIsInf( double value ) { +#if defined CV_INLINE_ISINF_DBL + CV_INLINE_ISINF_DBL(value); +#else Cv64suf ieee754; ieee754.f = value; return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && (unsigned)ieee754.u == 0; +#endif } #ifdef __cplusplus @@ -221,15 +260,8 @@ CV_INLINE int cvRound(float value) fistp t; } return t; -#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ - defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_FLT(value); -#elif defined CV_ICC || defined __GNUC__ -# if defined ARM_ROUND_FLT - ARM_ROUND_FLT(value); -# else - return (int)lrintf(value); -# endif +#elif defined CV_INLINE_ROUND_FLT + CV_INLINE_ROUND_FLT(value); #else /* it's ok if round does not comply with IEEE754 standard; the tests should allow +/-1 difference when the tested functions use round */ @@ -280,17 +312,25 @@ CV_INLINE int cvCeil( int value ) /** @overload */ CV_INLINE int cvIsNaN( float value ) { +#if defined CV_INLINE_ISNAN_FLT + CV_INLINE_ISNAN_FLT(value); +#else Cv32suf ieee754; ieee754.f = value; return (ieee754.u & 0x7fffffff) > 0x7f800000; +#endif } /** @overload */ CV_INLINE int cvIsInf( float value ) { +#if defined CV_INLINE_ISINF_FLT + CV_INLINE_ISINF_FLT(value); +#else Cv32suf ieee754; ieee754.f = value; return (ieee754.u & 0x7fffffff) == 0x7f800000; +#endif } #endif // __cplusplus