From 15783d65981d8978597c6b60e830e21e964cbdf9 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <2536374+asmorkalov@users.noreply.github.com>
Date: Fri, 12 Jul 2024 15:03:33 +0300
Subject: [PATCH] Merge pull request #25792 from
 asmorkalov:as/HAL_fast_GaussianBlur

Added flag to GaussianBlur for faster but not bit-exact implementation #25792

Rationale:
Current implementation of GaussianBlur is almost always bit-exact. It helps to get predictable results across platforms, but prohibits most approximations and optimization tricks.

The patch adds a `hint` parameter of the new generic `cv::AlgorithmHint` type to `GaussianBlur` and introduces the `ALGO_APPROX` value to allow a non-bit-exact implementation. With this hint, the generic HAL and IPP implementations are called first. The naming and location of the hint are subject to discussion.

Replaces https://github.com/opencv/opencv/pull/22073
Possibly related issue: https://github.com/opencv/opencv/issues/24135

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 CMakeLists.txt                                |  6 +-
 .../config_reference.markdown                 |  1 +
 modules/core/CMakeLists.txt                   |  4 ++
 modules/core/include/opencv2/core.hpp         | 12 ++++
 modules/core/src/system.cpp                   |  9 +++
 modules/imgproc/include/opencv2/imgproc.hpp   |  4 +-
 modules/imgproc/src/smooth.dispatch.cpp       | 58 ++++++++++++++++---
 modules/imgproc/test/test_smooth_bitexact.cpp | 56 +++++++++++++++++-
 modules/python/test/test_misc.py              |  4 ++
 modules/ts/src/ts.cpp                         |  1 +
 10 files changed, 143 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30b205ecd8..c196d0f2be 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1258,7 +1258,11 @@ if(CMAKE_GENERATOR MATCHES "Xcode|Visual Studio|Multi-Config")
 else()
   status("    Configuration:"  ${CMAKE_BUILD_TYPE})
 endif()
-
+if(DEFINED OPENCV_ALGO_HINT_DEFAULT)  # report the default algorithm hint in the configuration summary
+  status("    Algorithm Hint:"  ${OPENCV_ALGO_HINT_DEFAULT})
+else()  # no override given: report the same fallback that getDefaultAlgorithmHint() returns (modules/core/src/system.cpp)
+  status("    Algorithm Hint:" " ALGO_ACCURATE")
+endif()
 
 # ========================= CPU code generation mode =========================
 status("")
diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown
index 7ced9a2536..e43b8793e5 100644
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@@ -217,6 +217,7 @@ Following options can be used to produce special builds with instrumentation or
 | `ENABLE_BUILD_HARDENING` | GCC, Clang, MSVC | Enable compiler options which reduce possibility of code exploitation.  |
 | `ENABLE_LTO` | GCC, Clang, MSVC | Enable Link Time Optimization (LTO). |
 | `ENABLE_THIN_LTO` | Clang | Enable thin LTO which incorporates intermediate bitcode to binaries allowing consumers optimize their applications later. |
+| `OPENCV_ALGO_HINT_DEFAULT` | Any | Set default OpenCV implementation hint value: `ALGO_ACCURATE` or `ALGO_APPROX`. Dangerous! The option changes behaviour globally and may affect accuracy of many algorithms. |
 
 @see [GCC instrumentation](https://gcc.gnu.org/onlinedocs/gcc/Instrumentation-Options.html)
 @see [Build hardening](https://en.wikipedia.org/wiki/Hardening_(computing))
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 16f32c994a..ea1100c954 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -186,6 +186,10 @@ if(OPENCV_SEMIHOSTING)
   ocv_target_compile_definitions(${the_module} PRIVATE "-DOPENCV_SEMIHOSTING")
 endif(OPENCV_SEMIHOSTING)
 
+if(DEFINED OPENCV_ALGO_HINT_DEFAULT)  # pass the build-time default hint to the core sources as a preprocessor definition
+  ocv_target_compile_definitions(${the_module} PRIVATE "-DOPENCV_ALGO_HINT_DEFAULT=${OPENCV_ALGO_HINT_DEFAULT}")
+endif(DEFINED OPENCV_ALGO_HINT_DEFAULT)
+
 if(HAVE_HPX)
   ocv_target_link_libraries(${the_module} LINK_PRIVATE "${HPX_LIBRARIES}")
 endif()
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index b58a3a6ccb..4bfb95fede 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -150,6 +150,18 @@ It is possible to alternate error processing by using #redirectError().
  */
 CV_EXPORTS CV_NORETURN void error(const Exception& exc);
 
+/*! @brief Flags that allow to modify the behavior of some functions. Used as a set of flags.
+*/
+enum AlgorithmHint {
+    ALGO_DEFAULT = 0, //!< Default algorithm behaviour defined during OpenCV build
+    ALGO_ACCURATE = 1, //!< Use generic portable implementation
+    ALGO_APPROX = 2, //!< Allow alternative approximations to get faster implementation. Behaviour and result depend on the platform
+};
+
+/*! @brief Returns the AlgorithmHint selected by default, i.e. the value that `ALGO_DEFAULT` resolves to, as defined during OpenCV compilation.
+ */
+CV_EXPORTS_W AlgorithmHint getDefaultAlgorithmHint();
+
 enum SortFlags { SORT_EVERY_ROW    = 0, //!< each matrix row is sorted independently
                  SORT_EVERY_COLUMN = 1, //!< each matrix column is sorted
                                         //!< independently; this flag and the previous one are
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 8227175b6a..eccef84c92 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -46,6 +46,7 @@
 #include <iostream>
 #include <ostream>
 
+#include <opencv2/core.hpp>
 #include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/trace.private.hpp>
 
@@ -2888,6 +2889,14 @@ bool restoreFPDenormalsState(const FPDenormalsModeState& state)
 
 }  // namespace details
 
+AlgorithmHint getDefaultAlgorithmHint()  // resolves ALGO_DEFAULT to the concrete hint chosen at build time
+{
+#ifdef OPENCV_ALGO_HINT_DEFAULT
+    return OPENCV_ALGO_HINT_DEFAULT;  // build-time override injected by modules/core/CMakeLists.txt
+#else
+    return ALGO_ACCURATE;  // fallback when no build-time override is defined
+#endif
+}
 
 } // namespace cv
 
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 2f3c6f344f..53ff5ea6bd 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -1536,12 +1536,14 @@ respectively (see #getGaussianKernel for details); to fully control the result r
 possible future modifications of all this semantics, it is recommended to specify all of ksize,
 sigmaX, and sigmaY.
 @param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param hint Implementation modification flags. See #AlgorithmHint
 
 @sa  sepFilter2D, filter2D, blur, boxFilter, bilateralFilter, medianBlur
  */
 CV_EXPORTS_W void GaussianBlur( InputArray src, OutputArray dst, Size ksize,
                                 double sigmaX, double sigmaY = 0,
-                                int borderType = BORDER_DEFAULT );
+                                int borderType = BORDER_DEFAULT,
+                                AlgorithmHint hint = cv::ALGO_DEFAULT );
 
 /** @brief Applies the bilateral filter to an image.
 
diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp
index d0f50a73bb..6bc989e520 100644
--- a/modules/imgproc/src/smooth.dispatch.cpp
+++ b/modules/imgproc/src/smooth.dispatch.cpp
@@ -468,7 +468,7 @@ static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
 
 #endif
 
-#if defined ENABLE_IPP_GAUSSIAN_BLUR  // see CMake's OPENCV_IPP_GAUSSIAN_BLUR option
+#ifdef ENABLE_IPP_GAUSSIAN_BLUR  // see CMake's OPENCV_IPP_GAUSSIAN_BLUR option
 
 #define IPP_DISABLE_GAUSSIAN_BLUR_LARGE_KERNELS_1TH 1
 #define IPP_DISABLE_GAUSSIAN_BLUR_16SC4_1TH 1
@@ -526,14 +526,14 @@ private:
 
 #endif
 
-static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
+static bool ipp_GaussianBlur(cv::Mat& src, cv::Mat& dst, Size ksize,
                    double sigma1, double sigma2, int borderType )
 {
 #ifdef HAVE_IPP_IW
     CV_INSTRUMENT_REGION_IPP();
 
 #if IPP_VERSION_X100 < 201800 && ((defined _MSC_VER && defined _M_IX86) || (defined __GNUC__ && defined __i386__))
-    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
+    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
     return false; // bug on ia32
 #else
     if(sigma1 != sigma2)
@@ -548,8 +548,6 @@ static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
     // Acquire data and begin processing
     try
     {
-        Mat src = _src.getMat();
-        Mat dst = _dst.getMat();
         ::ipp::IwiImage       iwSrc      = ippiGetImage(src);
         ::ipp::IwiImage       iwDst      = ippiGetImage(dst);
         ::ipp::IwiBorderSize  borderSize = ::ipp::iwiSizeToBorderSize(ippiGetSize(ksize));
@@ -589,7 +587,7 @@ static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
     return true;
 #endif
 #else
-    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
+    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
     return false;
 #endif
 }
@@ -610,10 +608,13 @@ static bool validateGaussianBlurKernel(std::vector<T>& kernel)
 
 void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
                   double sigma1, double sigma2,
-                  int borderType)
+                  int borderType, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == cv::ALGO_DEFAULT)
+        hint = cv::getDefaultAlgorithmHint();
+
     CV_Assert(!_src.empty());
 
     int type = _src.type();
@@ -693,7 +694,27 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
                     src2.locateROI( wsz, ofs );
 
                 CALL_HAL(gaussianBlurBinomial, cv_hal_gaussianBlurBinomial, src2.ptr(), src2.step, dst.ptr(), dst.step, src2.cols, src2.rows, sdepth, cn,
-                         ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width, borderType&~BORDER_ISOLATED);
+                         ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width,
+                         borderType & ~BORDER_ISOLATED);
+            }
+
+            if (hint == ALGO_APPROX)
+            {
+                Point ofs;
+                Size wsz(src.cols, src.rows);
+                if(!(borderType & BORDER_ISOLATED))
+                    src.locateROI( wsz, ofs );
+
+                CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
+                        ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
+                        sigma1, sigma2, borderType & ~BORDER_ISOLATED);
+
+#ifdef ENABLE_IPP_GAUSSIAN_BLUR
+                // IPP is not bit-exact to OpenCV implementation
+                CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType));
+#endif
+                CV_OVX_RUN(true,
+                        openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
             }
 
             CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType),
@@ -747,6 +768,25 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
                          ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width, borderType&~BORDER_ISOLATED);
             }
 
+            if (hint == ALGO_APPROX)
+            {
+                Point ofs;
+                Size wsz(src.cols, src.rows);
+                if(!(borderType & BORDER_ISOLATED))
+                    src.locateROI( wsz, ofs );
+
+                CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
+                        ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
+                        sigma1, sigma2, borderType & ~BORDER_ISOLATED);
+
+#ifdef ENABLE_IPP_GAUSSIAN_BLUR
+                // IPP is not bit-exact to OpenCV implementation
+                CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType));
+#endif
+                CV_OVX_RUN(true,
+                        openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
+            }
+
             CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint32_t*)&fkx[0], (int)fkx.size(), (const uint32_t*)&fky[0], (int)fky.size(), borderType),
                 CV_CPU_DISPATCH_MODES_ALL);
 
@@ -772,7 +812,7 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
 
     CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
              ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
-             sigma1, sigma2, borderType&~BORDER_ISOLATED);
+             sigma1, sigma2, borderType & ~BORDER_ISOLATED);
 
     CV_OVX_RUN(true,
                openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
diff --git a/modules/imgproc/test/test_smooth_bitexact.cpp b/modules/imgproc/test/test_smooth_bitexact.cpp
index d4ae2af833..2d1f7b5a4e 100644
--- a/modules/imgproc/test/test_smooth_bitexact.cpp
+++ b/modules/imgproc/test/test_smooth_bitexact.cpp
@@ -244,7 +244,7 @@ static void checkGaussianBlur_8Uvs32F(const Mat& src8u, const Mat& src32f, int N
 TEST(GaussianBlur_Bitexact, regression_9863)
 {
     Mat src8u = imread(cvtest::findDataFile("shared/lena.png"));
-     Mat src32f; src8u.convertTo(src32f, CV_32F);
+    Mat src32f; src8u.convertTo(src32f, CV_32F);
 
     checkGaussianBlur_8Uvs32F(src8u, src32f, 151, 30);
 }
@@ -260,4 +260,58 @@ TEST(GaussianBlur_Bitexact, overflow_20792)
     EXPECT_GT(count, nintyPercent);
 }
 
+CV_ENUM(GaussInputType, CV_8U, CV_16S);  // input depths covered by the approximate-vs-accurate comparison
+CV_ENUM(GaussBorder, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101);  // border modes covered by the comparison
+
+struct GaussianBlurVsBitexact: public testing::TestWithParam<tuple<GaussInputType, int, double, GaussBorder>>  // params: input depth, kernel size, sigma, border type
+{
+    virtual void SetUp()  // loads the shared test image into `orig`
+    {
+        orig = imread(findDataFile("shared/lena.png"));
+        EXPECT_FALSE(orig.empty()) << "Cannot find test image shared/lena.png";
+    }
+
+    Mat orig;  // image as loaded from disk; the test body converts it to the depth under test
+};
+
+// NOTE: The test was designed for IPP (-DOPENCV_IPP_GAUSSIAN_BLUR=ON).
+// It should be extended after new HAL integration.
+TEST_P(GaussianBlurVsBitexact, approx)
+{
+    auto testParams = GetParam();
+    int dtype = get<0>(testParams);
+    int ksize = get<1>(testParams);
+    double sigma = get<2>(testParams);
+    int border = get<3>(testParams);
+
+    Mat src;
+    orig.convertTo(src, dtype);
+
+    cv::Mat gt;  // reference result computed with ALGO_ACCURATE
+    GaussianBlur(src, gt, Size(ksize, ksize), sigma, sigma, border, ALGO_ACCURATE);
+
+    cv::Mat dst;  // result under test computed with ALGO_APPROX
+    GaussianBlur(src, dst, Size(ksize, ksize), sigma, sigma, border, ALGO_APPROX);
+
+    cv::Mat diff;
+    cv::absdiff(dst, gt, diff);
+    cv::Mat flatten_diff = diff.reshape(1, diff.rows);  // single-channel view, so every channel value is counted on its own
+
+    int nz = countNonZero(flatten_diff);
+    EXPECT_LE(nz, 0.06*src.total()); // count of differing channel values must not exceed 6% of the pixel count
+
+    double min_val, max_val;
+    minMaxLoc(flatten_diff, &min_val, &max_val);
+    EXPECT_LE(max_val, 2); // any single value may differ from the accurate result by at most 2
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/, GaussianBlurVsBitexact,
+    testing::Combine(
+        GaussInputType::all(),  // input depth
+        testing::Values(3, 5, 7),  // square kernel size
+        testing::Values(0.75, 1.25),  // sigma, used for both X and Y
+        GaussBorder::all()  // border type
+    )
+);
+
 }} // namespace
diff --git a/modules/python/test/test_misc.py b/modules/python/test/test_misc.py
index 08ab04d53d..ac2b02f875 100644
--- a/modules/python/test/test_misc.py
+++ b/modules/python/test/test_misc.py
@@ -987,6 +987,10 @@ class SamplesFindFile(NewOpenCVTests):
         except cv.error as _e:
             pass
 
+class AlgorithmImplHit(NewOpenCVTests):
+    def test_callable(self):  # smoke test: the binding is callable and returns a value
+        res = cv.getDefaultAlgorithmHint()
+        self.assertTrue(res is not None)
 
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()
diff --git a/modules/ts/src/ts.cpp b/modules/ts/src/ts.cpp
index fb60a18ff1..9fe5cb3937 100644
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
@@ -1126,6 +1126,7 @@ void SystemInfoCollector::OnTestProgramStart(const testing::UnitTest&)
     recordPropertyVerbose("cv_vcs_version", "OpenCV VCS version", getSnippetFromConfig("Version control:", "\n"));
     recordPropertyVerbose("cv_build_type", "Build type", getSnippetFromConfig("Configuration:", "\n"), CV_TEST_BUILD_CONFIG);
     recordPropertyVerbose("cv_compiler", "Compiler", getSnippetFromConfig("C++ Compiler:", "\n"));
+    recordPropertyVerbose("implementation_hint", "Algorithm hint", getSnippetFromConfig("Algorithm Hint:", "\n"));
     const char* parallelFramework = cv::currentParallelFramework();
     if (parallelFramework)
     {