From b2135be5942c98cd638f3cef4845672573668d73 Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy" <pmur@users.noreply.github.com>
Date: Wed, 24 Jul 2019 14:12:40 -0500
Subject: [PATCH 1/3] fast_math: add extra perf/unit tests

Add basic sanity tests to verify that the rounding functions
(cvRound, cvCeil, cvFloor) and the FP classification functions
(cvIsNaN, cvIsInf) work as expected.

Likewise, extend the rounding performance test to cover the
additional float -> int fast math functions.
---
 modules/core/perf/perf_cvround.cpp | 78 +++++++++++++++++-------------
 modules/core/test/test_math.cpp    | 54 +++++++++++++++++++++
 2 files changed, 98 insertions(+), 34 deletions(-)
diff --git a/modules/core/perf/perf_cvround.cpp b/modules/core/perf/perf_cvround.cpp
index 933792dcaa..0e3ceb0597 100644
--- a/modules/core/perf/perf_cvround.cpp
+++ b/modules/core/perf/perf_cvround.cpp
@@ -4,42 +4,52 @@ namespace opencv_test
 {
 using namespace perf;
 
-template <typename T>
-static void CvRoundMat(const cv::Mat & src, cv::Mat & dst)
-{
-    for (int y = 0; y < dst.rows; ++y)
-    {
-        const T * sptr = src.ptr<T>(y);
-        int * dptr = dst.ptr<int>(y);
-
-        for (int x = 0; x < dst.cols; ++x)
-            dptr[x] = cvRound(sptr[x]);
+#define DECL_ROUND_TEST(NAME, OP, EXTRA) /* defines OP##Mat<T>() plus the perf test CvRound_Float##NAME */ \
+    template <typename T> /* T: element type read from src */      \
+    static void OP ## Mat(const cv::Mat & src, cv::Mat & dst)      \
+    {                                                              \
+        for (int y = 0; y < dst.rows; ++y)                         \
+        {                                                          \
+            const T * sptr = src.ptr<T>(y);                        \
+            int * dptr = dst.ptr<int>(y);                          \
+                                                                   \
+            for (int x = 0; x < dst.cols; ++x)                     \
+                dptr[x] = OP(sptr[x]) EXTRA; /* EXTRA: optional tail pasted after the call, e.g. "? 1 : 2" */ \
+        }                                                          \
+    }                                                              \
+                                                                   \
+    PERF_TEST_P(Size_MatType, CvRound_Float ## NAME,               \
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),   \
+                             testing::Values(CV_32FC1, CV_64FC1))) \
+    {                                                              \
+        Size size = get<0>(GetParam());                            \
+        int type = get<1>(GetParam()), depth = CV_MAT_DEPTH(type); \
+                                                                   \
+        cv::Mat src(size, type), dst(size, CV_32SC1);              \
+                                                                   \
+        declare.in(src, WARMUP_RNG).out(dst);                      \
+                                                                   \
+        if (depth == CV_32F) /* pick the instantiation matching the source depth */ \
+        {                                                          \
+            TEST_CYCLE()                                           \
+                OP ## Mat<float>(src, dst);                        \
+        }                                                          \
+        else if (depth == CV_64F)                                  \
+        {                                                          \
+            TEST_CYCLE()                                           \
+                OP ## Mat<double>(src, dst);                       \
+        }                                                          \
+                                                                   \
+        SANITY_CHECK_NOTHING(); /* no result sanity check is requested */ \
     }
-}
-
-PERF_TEST_P(Size_MatType, CvRound_Float,
-            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
-                             testing::Values(CV_32FC1, CV_64FC1)))
-{
-    Size size = get<0>(GetParam());
-    int type = get<1>(GetParam()), depth = CV_MAT_DEPTH(type);
 
-    cv::Mat src(size, type), dst(size, CV_32SC1);
-
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (depth == CV_32F)
-    {
-        TEST_CYCLE()
-            CvRoundMat<float>(src, dst);
-    }
-    else if (depth == CV_64F)
-    {
-        TEST_CYCLE()
-            CvRoundMat<double>(src, dst);
-    }
+DECL_ROUND_TEST(,cvRound,)
+DECL_ROUND_TEST(_Ceil,cvCeil,)
+DECL_ROUND_TEST(_Floor,cvFloor,)
 
-    SANITY_CHECK_NOTHING();
-}
+/* For FP classification tests, try to test them in a way that uses
+   branching logic and avoids extra FP logic. */
+DECL_ROUND_TEST(_NaN,cvIsNaN, ? 1 : 2)
+DECL_ROUND_TEST(_Inf,cvIsInf, ? 1 : 2)
 
 } // namespace
diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp
index d65b3fa39e..8b13a391cb 100644
--- a/modules/core/test/test_math.cpp
+++ b/modules/core/test/test_math.cpp
@@ -3923,5 +3923,59 @@ TEST(Core_SoftFloat, CvRound)
     }
 }
 
+template<typename T>  // Checks cvCeil/cvFloor/cvRound of `in` against the expected ceiling and floor.
+static void checkRounding(T in, int outCeil, int outFloor)
+{
+    EXPECT_EQ(outCeil,cvCeil(in));    // cvCeil must match exactly
+    EXPECT_EQ(outFloor,cvFloor(in));  // cvFloor must match exactly
+
+    /* cvRound is not expected to be IEEE compliant, so no single result is
+       required: it only has to equal either outCeil or outFloor. */
+    EXPECT_TRUE((cvRound(in) == outCeil) || (cvRound(in) == outFloor));
+}
+
+TEST(Core_FastMath, InlineRoundingOps)  // Sanity-checks the rounding helpers for both double and float inputs.
+{
+    struct
+    {
+        double in;     // input value
+        int outCeil;   // expected cvCeil(in)
+        int outFloor;  // expected cvFloor(in)
+    } values[] =
+    {
+        // Values are chosen to convert to binary float 32/64 exactly
+        { 1.0, 1, 1 },
+        { 1.5, 2, 1 },
+        { -1.5, -1, -2}
+    };
+
+    for (int i = 0, maxi = sizeof(values) / sizeof(values[0]); i < maxi; i++)  // maxi = number of table entries
+    {
+        checkRounding<double>(values[i].in, values[i].outCeil, values[i].outFloor);
+        checkRounding<float>((float)values[i].in, values[i].outCeil, values[i].outFloor);  // same expectations for the float overloads
+    }
+}
+
+TEST(Core_FastMath, InlineNaN)  // cvIsNaN must return 1 for NaN (plain or negated) and 0 for zero, for float and double.
+{
+    EXPECT_EQ( cvIsNaN((float) NAN), 1);
+    EXPECT_EQ( cvIsNaN((float) -NAN), 1);   // negated NaN is still NaN
+    EXPECT_EQ( cvIsNaN(0.0f), 0);
+    EXPECT_EQ( cvIsNaN((double) NAN), 1);
+    EXPECT_EQ( cvIsNaN((double) -NAN), 1);  // negated NaN is still NaN
+    EXPECT_EQ( cvIsNaN(0.0), 0);
+}
+
+TEST(Core_FastMath, InlineIsInf)  // cvIsInf must return 1 for either infinity and 0 for zero, for float and double.
+{
+    // Assume HUGE_VAL is infinity. Strictly speaking, this may not always be true.
+    EXPECT_EQ( cvIsInf((float) HUGE_VAL), 1);
+    EXPECT_EQ( cvIsInf((float) -HUGE_VAL), 1);   // negative infinity also yields 1, not -1
+    EXPECT_EQ( cvIsInf(0.0f), 0);
+    EXPECT_EQ( cvIsInf((double) HUGE_VAL), 1);
+    EXPECT_EQ( cvIsInf((double) -HUGE_VAL), 1);  // negative infinity also yields 1, not -1
+    EXPECT_EQ( cvIsInf(0.0), 0);
+}
+
 }} // namespace
 /* End of file. */

From 3f92bcc11ab122b6cb5167c3548a0e2ce306c94d Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy" <pmur@users.noreply.github.com>
Date: Mon, 22 Jul 2019 14:23:49 -0500
Subject: [PATCH 2/3] fast_math: selectively use GCC rounding builtins when
 available

Add a new macro definition OPENCV_USE_FASTMATH_GCC_BUILTINS to enable
usage of GCC inline math functions, if available and requested by the
user.

Likewise, enable it for POWER. This is nearly always a substantial
improvement over using integer manipulation as most operations can
be done in several instructions with no branching. The result is a
1.5-1.8x speedup in the ceil/floor operations [1].

[1] As tested with AT 12.0-1 (GCC 8.3.1) compiler on P9 LE.
---
 .../core/include/opencv2/core/fast_math.hpp   | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp
index d9ea28e632..6eb6f1fa00 100644
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@@ -92,6 +92,19 @@
     #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
 #endif
 
+#if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS
+    /* Let GCC inline C math functions when available. Dedicated hardware is available to
+       round and convert FP values. */
+    #define OPENCV_USE_FASTMATH_GCC_BUILTINS
+#endif
+
+/* Enable GCC builtin math functions if possible, desired, and available.
+   Note, not all math functions inline equally. E.g. lrint will not inline
+   without the -fno-math-errno option. */
+#if defined OPENCV_USE_FASTMATH_GCC_BUILTINS && defined __GNUC__ && !defined __clang__ && !defined (__CUDACC__)
+    #define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
+#endif
+
 /** @brief Rounds floating-point number to the nearest integer
 
  @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
@@ -138,8 +151,12 @@ cvRound( double value )
  */
 CV_INLINE int cvFloor( double value )
 {
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
+    return __builtin_floor(value);
+#else
     int i = (int)value;
     return i - (i > value);
+#endif
 }
 
 /** @brief Rounds floating-point number to the nearest integer not smaller than the original.
@@ -151,8 +168,12 @@ CV_INLINE int cvFloor( double value )
  */
 CV_INLINE int cvCeil( double value )
 {
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
+    return __builtin_ceil(value);
+#else
     int i = (int)value;
     return i + (i < value);
+#endif
 }
 
 /** @brief Determines if the argument is Not A Number.
@@ -225,8 +246,12 @@ CV_INLINE int cvRound( int value )
 /** @overload */
 CV_INLINE int cvFloor( float value )
 {
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
+    return __builtin_floorf(value);
+#else
     int i = (int)value;
     return i - (i > value);
+#endif
 }
 
 /** @overload */
@@ -238,8 +263,12 @@ CV_INLINE int cvFloor( int value )
 /** @overload */
 CV_INLINE int cvCeil( float value )
 {
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
+    return __builtin_ceilf(value);
+#else
     int i = (int)value;
     return i + (i < value);
+#endif
 }
 
 /** @overload */

From f38a61c66d8e45ad0b5e21bdc6e55223d1cdb59e Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy" <pmur@users.noreply.github.com>
Date: Mon, 22 Jul 2019 14:23:56 -0500
Subject: [PATCH 3/3] fast_math: implement optimized PPC routines

Implement cvRound using inline asm. No compiler support
exists today to properly optimize this. This results in
about a 4x speedup over the default rounding. Likewise,
simplify the growing number of rounding function overloads.

For P9 enabled targets, utilize the classification
testing instruction to test for Inf/NaN values. Operation
speedup is about 1.2x for FP32, and 1.5x for FP64 operands.

For P8 targets, fall back to the GCC isnan builtin. It provides
a 1.1/1.4x improvement for FP32/FP64 arguments.
---
 .../core/include/opencv2/core/fast_math.hpp   | 84 ++++++++++++++-----
 1 file changed, 62 insertions(+), 22 deletions(-)

diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp
index 6eb6f1fa00..b1e8c4202d 100644
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@@ -74,7 +74,20 @@
 #  include "tegra_round.hpp"
 #endif
 
-#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__)
+#if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__)
+#  include <altivec.h>
+   /* altivec.h may define 'vector', 'bool' and 'pixel' as macros. Undefine them so that
+      including this header does not break code using those names (e.g. std::vector). */
+#  undef vector
+#  undef bool
+#  undef pixel
+#endif
+
+#if ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
+        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
+    #define CV_INLINE_ROUND_DBL(value) TEGRA_ROUND_DBL(value);
+    #define CV_INLINE_ROUND_FLT(value) TEGRA_ROUND_FLT(value);
+#elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__)
     // 1. general scheme
     #define ARM_ROUND(_value, _asm_string) \
         int res; \
@@ -84,12 +92,32 @@
         return res
     // 2. version for double
     #ifdef __clang__
-        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
     #else
-        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
     #endif
     // 3. version for float
-    #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+    #define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+#elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__)
+    // P8 and newer machines can convert fp32/64 to int quickly: fctiw converts into the FP temporary, mffprwz moves it to the integer result.
+    #define CV_INLINE_ROUND_DBL(value) \
+        int out; \
+        double temp; \
+        __asm__( "fctiw %[temp],%[in]\n\tmffprwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \
+        return out;
+
+    // FP32 also works with the FP64 routine above, because the asm input is cast to double
+    #define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value)
+
+    #ifdef _ARCH_PWR9
+        #define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30); /* 0x30: class mask used for the infinity test */
+        #define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40); /* 0x40: class mask used for the NaN test */
+        #define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value)
+        #define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value)
+    #endif
+#elif defined CV_ICC || defined __GNUC__
+    #define CV_INLINE_ROUND_DBL(value) return (int)(lrint(value));
+    #define CV_INLINE_ROUND_FLT(value) return (int)(lrintf(value));
 #endif
 
 #if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS
@@ -105,6 +133,16 @@
     #define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
 #endif
 
+/* Allow overrides for some functions which may benefit from tuning. Note that
+   the isinf builtin is not used here because its return value is signed. */
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_DBL
+    #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
+#endif
+
+#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_FLT
+    #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value);
+#endif
+
 /** @brief Rounds floating-point number to the nearest integer
 
  @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
@@ -125,15 +163,8 @@ cvRound( double value )
         fistp t;
     }
     return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
-        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_DBL(value);
-#elif defined CV_ICC || defined __GNUC__
-# if defined ARM_ROUND_DBL
-    ARM_ROUND_DBL(value);
-# else
-    return (int)lrint(value);
-# endif
+#elif defined CV_INLINE_ROUND_DBL
+    CV_INLINE_ROUND_DBL(value);
 #else
     /* it's ok if round does not comply with IEEE754 standard;
        the tests should allow +/-1 difference when the tested functions use round */
@@ -184,10 +215,14 @@ CV_INLINE int cvCeil( double value )
  otherwise. */
 CV_INLINE int cvIsNaN( double value )
 {
+#if defined CV_INLINE_ISNAN_DBL
+    CV_INLINE_ISNAN_DBL(value);
+#else
     Cv64suf ieee754;
     ieee754.f = value;
     return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
            ((unsigned)ieee754.u != 0) > 0x7ff00000;
+#endif
 }
 
 /** @brief Determines if the argument is Infinity.
@@ -198,10 +233,14 @@ CV_INLINE int cvIsNaN( double value )
  and 0 otherwise. */
 CV_INLINE int cvIsInf( double value )
 {
+#if defined CV_INLINE_ISINF_DBL
+    CV_INLINE_ISINF_DBL(value);
+#else
     Cv64suf ieee754;
     ieee754.f = value;
     return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
             (unsigned)ieee754.u == 0;
+#endif
 }
 
 #ifdef __cplusplus
@@ -221,15 +260,8 @@ CV_INLINE int cvRound(float value)
         fistp t;
     }
     return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
-        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_FLT(value);
-#elif defined CV_ICC || defined __GNUC__
-# if defined ARM_ROUND_FLT
-    ARM_ROUND_FLT(value);
-# else
-    return (int)lrintf(value);
-# endif
+#elif defined CV_INLINE_ROUND_FLT
+    CV_INLINE_ROUND_FLT(value);
 #else
     /* it's ok if round does not comply with IEEE754 standard;
      the tests should allow +/-1 difference when the tested functions use round */
@@ -280,17 +312,25 @@ CV_INLINE int cvCeil( int value )
 /** @overload */
 CV_INLINE int cvIsNaN( float value )
 {
+#if defined CV_INLINE_ISNAN_FLT
+    CV_INLINE_ISNAN_FLT(value);
+#else
     Cv32suf ieee754;
     ieee754.f = value;
     return (ieee754.u & 0x7fffffff) > 0x7f800000;
+#endif
 }
 
 /** @overload */
 CV_INLINE int cvIsInf( float value )
 {
+#if defined CV_INLINE_ISINF_FLT
+    CV_INLINE_ISINF_FLT(value);
+#else
     Cv32suf ieee754;
     ieee754.f = value;
     return (ieee754.u & 0x7fffffff) == 0x7f800000;
+#endif
 }
 
 #endif // __cplusplus