From 8b2783e9ffe3b6a5f9a84b34cc72da0c71d41a3a Mon Sep 17 00:00:00 2001
From: Christine Poerschke <6458642+cpoerschke@users.noreply.github.com>
Date: Sat, 25 May 2024 08:53:33 +0100
Subject: [PATCH 01/17] replace lena.jpg in find-existing-file tests

---
 modules/core/test/test_utils.cpp | 2 +-
 modules/python/test/test_misc.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp
index a43ea78381..13720c2b00 100644
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -345,7 +345,7 @@ TEST(Samples, findFile)
 {
     cv::utils::logging::LogLevel prev = cv::utils::logging::setLogLevel(cv::utils::logging::LOG_LEVEL_VERBOSE);
     cv::String path;
-    ASSERT_NO_THROW(path = samples::findFile("lena.jpg", false));
+    ASSERT_NO_THROW(path = samples::findFile("HappyFish.jpg", false));
     EXPECT_NE(std::string(), path.c_str());
     cv::utils::logging::setLogLevel(prev);
 }
diff --git a/modules/python/test/test_misc.py b/modules/python/test/test_misc.py
index 08ab04d53d..ec86663e47 100644
--- a/modules/python/test/test_misc.py
+++ b/modules/python/test/test_misc.py
@@ -973,7 +973,7 @@ class CanUsePurePythonModuleFunction(NewOpenCVTests):
 class SamplesFindFile(NewOpenCVTests):
 
     def test_ExistedFile(self):
-        res = cv.samples.findFile('lena.jpg', False)
+        res = cv.samples.findFile('HappyFish.jpg', False)
         self.assertNotEqual(res, '')
 
     def test_MissingFile(self):

From db3654ef51b156feab4f59c13f2ee41ca2ab9a85 Mon Sep 17 00:00:00 2001
From: Kumataro <Kumataro@users.noreply.github.com>
Date: Sun, 21 Jul 2024 10:00:29 +0900
Subject: [PATCH 02/17] python: prefer cv::Mat over cv::UMat in python binding

---
 modules/python/src2/gen2.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py
index 3249c57f82..af187e5d3f 100755
--- a/modules/python/src2/gen2.py
+++ b/modules/python/src2/gen2.py
@@ -854,7 +854,22 @@ class FuncInfo(object):
 
         all_code_variants = []
 
+        # See https://github.com/opencv/opencv/issues/25928
+        # Conversion to UMat is more expensive than conversion to Mat.
+        # To reduce this cost, variants taking Mat are tried before variants taking UMat.
+        variants = []
+        variants_umat = []
         for v in self.variants:
+            hasUMat = False
+            for a in v.args:
+                hasUMat = hasUMat or "UMat" in a.tp
+            if hasUMat :
+                variants_umat.append(v)
+            else:
+                variants.append(v)
+        variants.extend(variants_umat)
+
+        for v in variants:
             code_decl = ""
             code_ret = ""
             code_cvt_list = []

From 0b3dbdd4b3395674fe9d05162ab916b83e683559 Mon Sep 17 00:00:00 2001
From: Alexander Lyulkov <alexander.lyulkov@opencv.ai>
Date: Thu, 25 Jul 2024 16:47:41 +0300
Subject: [PATCH 03/17] Added Java ORB test

---
 .../java/test/ORBFeatureDetectorTest.java     | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/modules/features2d/misc/java/test/ORBFeatureDetectorTest.java b/modules/features2d/misc/java/test/ORBFeatureDetectorTest.java
index 7399253af6..2cd9977fb2 100644
--- a/modules/features2d/misc/java/test/ORBFeatureDetectorTest.java
+++ b/modules/features2d/misc/java/test/ORBFeatureDetectorTest.java
@@ -1,5 +1,13 @@
 package org.opencv.test.features2d;
 
+import org.junit.Assert;
+import org.opencv.core.CvType;
+import org.opencv.core.KeyPoint;
+import org.opencv.core.Mat;
+import org.opencv.core.MatOfKeyPoint;
+import org.opencv.core.Scalar;
+import org.opencv.features2d.Features2d;
+import org.opencv.features2d.ORB;
 import org.opencv.test.OpenCVTestCase;
 
 public class ORBFeatureDetectorTest extends OpenCVTestCase {
@@ -36,4 +44,35 @@ public class ORBFeatureDetectorTest extends OpenCVTestCase {
         fail("Not yet implemented");
     }
 
+    public void testDetectTwoPoints() {
+        Mat img = new Mat(256,256, CvType.CV_8UC3, new Scalar(0,0,0));
+        img.put(35, 40, 255,255, 255);
+        img.put(152, 98, 200,0, 0);
+
+        MatOfKeyPoint keypoints = new MatOfKeyPoint();
+        ORB orb = ORB.create();
+        Mat descriptors = new Mat();
+        orb.detectAndCompute(img, new Mat(), keypoints, descriptors);
+
+        KeyPoint[] keypointsArray = keypoints.toArray();
+        assertEquals(2, keypointsArray.length);
+
+        long x1 = Math.round(keypointsArray[0].pt.x);
+        long y1 = Math.round(keypointsArray[0].pt.y);
+        long x2 = Math.round(keypointsArray[1].pt.x);
+        long y2 = Math.round(keypointsArray[1].pt.y);
+
+        if (x2 > x1) {
+            assertEquals(40, x1);
+            assertEquals(35, y1);
+            assertEquals(98, x2);
+            assertEquals(152, y2);
+        } else {
+            assertEquals(40, x2);
+            assertEquals(35, y2);
+            assertEquals(98, x1);
+            assertEquals(152, y1);
+        }
+    }
+
 }

From be3c519956296bb9254040a231bd63454d37958c Mon Sep 17 00:00:00 2001
From: Kumataro <Kumataro@users.noreply.github.com>
Date: Fri, 26 Jul 2024 05:55:00 +0900
Subject: [PATCH 04/17] core: FileStorage: detect invalid attribute value

---
 modules/core/src/persistence_xml.cpp |  2 ++
 modules/core/test/test_io.cpp        | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/modules/core/src/persistence_xml.cpp b/modules/core/src/persistence_xml.cpp
index 6141fade2d..ed699758fc 100644
--- a/modules/core/src/persistence_xml.cpp
+++ b/modules/core/src/persistence_xml.cpp
@@ -737,6 +737,8 @@ public:
                 if( c != '\"' && c != '\'' )
                 {
                     ptr = skipSpaces( ptr, CV_XML_INSIDE_TAG );
+                    if(!ptr)
+                        CV_PARSE_ERROR_CPP("Invalid attribute value");
                     if( *ptr != '\"' && *ptr != '\'' )
                         CV_PARSE_ERROR_CPP( "Attribute value should be put into single or double quotes" );
                 }
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index 16b66e75ee..d7be6e08c6 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -1985,4 +1985,22 @@ INSTANTIATE_TEST_CASE_P( /*nothing*/,
     Core_InputOutput_regression_25073,
     Values("test.json", "test.xml", "test.yml") );
 
+// see https://github.com/opencv/opencv/issues/25946
+TEST(Core_InputOutput, FileStorage_invalid_attribute_value_regression_25946)
+{
+    const std::string fileName = cv::tempfile("FileStorage_invalid_attribute_value_exception_test.xml");
+    const std::string content = "<?xml \n_=";
+
+    std::fstream testFile;
+    testFile.open(fileName.c_str(), std::fstream::out);
+    if(!testFile.is_open()) FAIL();
+    testFile << content;
+    testFile.close();
+
+    FileStorage fs;
+    EXPECT_ANY_THROW( fs.open(fileName, FileStorage::READ + FileStorage::FORMAT_XML) );
+
+    ASSERT_EQ(0, std::remove(fileName.c_str()));
+}
+
 }} // namespace

From 99672a2691d96f5b4782283aec8f3429c01aefb7 Mon Sep 17 00:00:00 2001
From: gblikas <gblikas@gmail.com>
Date: Fri, 26 Jul 2024 13:24:26 -0700
Subject: [PATCH 05/17] fix: js perf tests

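Both modules/js/perf/perf_helpfunc.js and the individual perf tests (e.g. perf_gaussianBlur.js) declared "const isNodeJs", which led to a re-definition error when the scripts were loaded together by the associated *.html files. Declaring it with "var" allows the repeated declaration.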
---
 modules/js/perf/perf_64bits.js                       | 2 +-
 modules/js/perf/perf_helpfunc.js                     | 2 +-
 modules/js/perf/perf_imgproc/perf_blur.js            | 2 +-
 modules/js/perf/perf_imgproc/perf_cvtcolor.js        | 2 +-
 modules/js/perf/perf_imgproc/perf_dilate.js          | 2 +-
 modules/js/perf/perf_imgproc/perf_erode.js           | 2 +-
 modules/js/perf/perf_imgproc/perf_filter2D.js        | 2 +-
 modules/js/perf/perf_imgproc/perf_gaussianBlur.js    | 2 +-
 modules/js/perf/perf_imgproc/perf_medianBlur.js      | 2 +-
 modules/js/perf/perf_imgproc/perf_pyrDown.js         | 2 +-
 modules/js/perf/perf_imgproc/perf_remap.js           | 2 +-
 modules/js/perf/perf_imgproc/perf_resize.js          | 2 +-
 modules/js/perf/perf_imgproc/perf_scharr.js          | 2 +-
 modules/js/perf/perf_imgproc/perf_sobel.js           | 2 +-
 modules/js/perf/perf_imgproc/perf_threshold.js       | 2 +-
 modules/js/perf/perf_imgproc/perf_warpAffine.js      | 2 +-
 modules/js/perf/perf_imgproc/perf_warpPerspective.js | 2 +-
 17 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/modules/js/perf/perf_64bits.js b/modules/js/perf/perf_64bits.js
index dc4e234d4c..de75921e20 100644
--- a/modules/js/perf/perf_64bits.js
+++ b/modules/js/perf/perf_64bits.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if (isNodeJs) {
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_helpfunc.js b/modules/js/perf/perf_helpfunc.js
index c2ad7f2e0f..45cdd1c681 100644
--- a/modules/js/perf/perf_helpfunc.js
+++ b/modules/js/perf/perf_helpfunc.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if(isNodeJs) {
   var Base = require("./base");
diff --git a/modules/js/perf/perf_imgproc/perf_blur.js b/modules/js/perf/perf_imgproc/perf_blur.js
index 66c5f240e7..1f3981c132 100644
--- a/modules/js/perf/perf_imgproc/perf_blur.js
+++ b/modules/js/perf/perf_imgproc/perf_blur.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_cvtcolor.js b/modules/js/perf/perf_imgproc/perf_cvtcolor.js
index fbae5d1bca..a72236e5bc 100644
--- a/modules/js/perf/perf_imgproc/perf_cvtcolor.js
+++ b/modules/js/perf/perf_imgproc/perf_cvtcolor.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if (isNodeJs) {
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_dilate.js b/modules/js/perf/perf_imgproc/perf_dilate.js
index 5b6cd01682..5647cc3acd 100644
--- a/modules/js/perf/perf_imgproc/perf_dilate.js
+++ b/modules/js/perf/perf_imgproc/perf_dilate.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_erode.js b/modules/js/perf/perf_imgproc/perf_erode.js
index 8915ead40f..3edffbf534 100644
--- a/modules/js/perf/perf_imgproc/perf_erode.js
+++ b/modules/js/perf/perf_imgproc/perf_erode.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_filter2D.js b/modules/js/perf/perf_imgproc/perf_filter2D.js
index 4602befcbd..1a2169cae3 100644
--- a/modules/js/perf/perf_imgproc/perf_filter2D.js
+++ b/modules/js/perf/perf_imgproc/perf_filter2D.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_gaussianBlur.js b/modules/js/perf/perf_imgproc/perf_gaussianBlur.js
index b59aa83b84..5643fe4e6f 100644
--- a/modules/js/perf/perf_imgproc/perf_gaussianBlur.js
+++ b/modules/js/perf/perf_imgproc/perf_gaussianBlur.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_medianBlur.js b/modules/js/perf/perf_imgproc/perf_medianBlur.js
index 333bc8424c..29ff99663a 100644
--- a/modules/js/perf/perf_imgproc/perf_medianBlur.js
+++ b/modules/js/perf/perf_imgproc/perf_medianBlur.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_pyrDown.js b/modules/js/perf/perf_imgproc/perf_pyrDown.js
index 957ac7684d..df200e4f19 100644
--- a/modules/js/perf/perf_imgproc/perf_pyrDown.js
+++ b/modules/js/perf/perf_imgproc/perf_pyrDown.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_remap.js b/modules/js/perf/perf_imgproc/perf_remap.js
index 1aa69ecef7..38afef575a 100644
--- a/modules/js/perf/perf_imgproc/perf_remap.js
+++ b/modules/js/perf/perf_imgproc/perf_remap.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_resize.js b/modules/js/perf/perf_imgproc/perf_resize.js
index 5262d22489..ad2b949ed2 100644
--- a/modules/js/perf/perf_imgproc/perf_resize.js
+++ b/modules/js/perf/perf_imgproc/perf_resize.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_scharr.js b/modules/js/perf/perf_imgproc/perf_scharr.js
index 4726e76312..f4df15db55 100644
--- a/modules/js/perf/perf_imgproc/perf_scharr.js
+++ b/modules/js/perf/perf_imgproc/perf_scharr.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_sobel.js b/modules/js/perf/perf_imgproc/perf_sobel.js
index ddc09bb8f6..a082cd7b30 100644
--- a/modules/js/perf/perf_imgproc/perf_sobel.js
+++ b/modules/js/perf/perf_imgproc/perf_sobel.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_threshold.js b/modules/js/perf/perf_imgproc/perf_threshold.js
index 629628748d..71f55257a3 100644
--- a/modules/js/perf/perf_imgproc/perf_threshold.js
+++ b/modules/js/perf/perf_imgproc/perf_threshold.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_warpAffine.js b/modules/js/perf/perf_imgproc/perf_warpAffine.js
index dc3cf67af4..3917719869 100644
--- a/modules/js/perf/perf_imgproc/perf_warpAffine.js
+++ b/modules/js/perf/perf_imgproc/perf_warpAffine.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');
diff --git a/modules/js/perf/perf_imgproc/perf_warpPerspective.js b/modules/js/perf/perf_imgproc/perf_warpPerspective.js
index 252729e3f0..1b2e5b777a 100644
--- a/modules/js/perf/perf_imgproc/perf_warpPerspective.js
+++ b/modules/js/perf/perf_imgproc/perf_warpPerspective.js
@@ -1,4 +1,4 @@
-const isNodeJs = (typeof window) === 'undefined'? true : false;
+var isNodeJs = (typeof window) === 'undefined'? true : false;
 
 if　(isNodeJs)　{
   var Benchmark = require('benchmark');

From 938b9e4bb7ff26751c558ba3548e0ee53863ff35 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Thu, 18 Jul 2024 05:37:14 +0000
Subject: [PATCH 06/17] cmake: try baseline optimization feature check without
 extra flags first

---
 cmake/OpenCVCompilerOptimizations.cmake |  4 ++--
 cmake/checks/cpu_sse2.cpp               | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index ff0e40c666..418964ab0a 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -171,7 +171,7 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ")
 endif()
 
 if(X86 OR X86_64)
-  ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
 
   ocv_update(CPU_AVX512_COMMON_GROUP "AVX_512F;AVX_512CD")
   ocv_update(CPU_AVX512_KNL_GROUP "AVX512_COMMON;AVX512_KNL_EXTRA")
@@ -445,7 +445,7 @@ macro(ocv_check_compiler_optimization OPT)
       set(_varname "")
       if(CPU_${OPT}_TEST_FILE)
         set(__available 0)
-        if(CPU_BASELINE_DETECT)
+        if(__is_from_baseline OR CPU_BASELINE_DETECT)
           set(_varname "HAVE_CPU_${OPT}_SUPPORT")
           ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
           if(${_varname})
diff --git a/cmake/checks/cpu_sse2.cpp b/cmake/checks/cpu_sse2.cpp
index 68a69f88cb..2827a1a460 100644
--- a/cmake/checks/cpu_sse2.cpp
+++ b/cmake/checks/cpu_sse2.cpp
@@ -1,2 +1,16 @@
 #include <emmintrin.h>
-int main() { return 0; }
+
+inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
+{
+    const __m128i delta = _mm_set1_epi32((int)0x80000000);
+    return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
+}
+
+int main()
+{
+    __m128i a, b, c;
+    a = _mm_set1_epi32(0x00000000);
+    b = _mm_set1_epi32(0x0000ffff);
+    c = _v128_comgt_epu32(a, b);
+    return 0;
+}

From 2a333a6c86b7fd88bd21fd92e8d828aa6a0b595c Mon Sep 17 00:00:00 2001
From: Daniele Affinita <danieleaffinita2000@gmail.com>
Date: Tue, 30 Jul 2024 13:16:08 +0200
Subject: [PATCH 07/17] Merge pull request #25644 from
 DaniAffCH:blockwise-quantization

[GSoC] dnn: Blockwise quantization support #25644

This PR introduces blockwise quantization in DNN allowing the parsing of ONNX models quantized in blockwise style. In particular it modifies the `Quantize` and `Dequantize` operations. The related PR opencv/opencv_extra#1181 contains the test data.

Additional notes:
- The original quantization issue has been fixed. Previously, for 1D scale and zero-point, the operation applied was $y = int8(x/s - z)$ instead of $y = int8(x/s + z)$. Note that the operation was already correctly implemented when the scale and zero-point were scalars. The previous implementation failed the ONNX test cases; with this fix all of them pass. [Reference](https://github.com/onnx/onnx/blob/main/docs/Operators.md#QuantizeLinear)
- The function `block_repeat` broadcasts the scale and zero-point to the input shape by repeating every element along a given axis n times. It generalizes the behavior of `repeat` from the core module, which is defined only for 2 axes and assumes that `Mat` has 2 dimensions. If appropriate and useful, you might consider moving `block_repeat` to the core module.
- The scale and zero-point can now be taken as layer inputs. This increases the ONNX layers' coverage and lets us run the previously disabled ONNX test cases, in full compliance with the ONNX standard. Since they are now supported, I have enabled the following test cases on the CPU backend only: `test_dequantizelinear`, `test_dequantizelinear_axis`, `test_dequantizelinear_blocked`, `test_quantizelinear`, `test_quantizelinear_axis`, `test_quantizelinear_blocked`. All of them pass successfully.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 .../dnn/src/int8layers/quantization_utils.cpp | 201 +++++++++++++++---
 modules/dnn/src/onnx/onnx_importer.cpp        |  16 +-
 modules/dnn/test/test_onnx_conformance.cpp    |   2 +
 ...conformance_layer_filter__openvino.inl.hpp |  12 +-
 ...ance_layer_filter__vulkan_denylist.inl.hpp |   6 +
 ...er_filter_opencv_ocl_fp16_denylist.inl.hpp |   8 +-
 ...er_filter_opencv_ocl_fp32_denylist.inl.hpp |   6 +
 ..._conformance_layer_parser_denylist.inl.hpp |   4 -
 8 files changed, 212 insertions(+), 43 deletions(-)

diff --git a/modules/dnn/src/int8layers/quantization_utils.cpp b/modules/dnn/src/int8layers/quantization_utils.cpp
index 146ad68257..4690f68e5f 100644
--- a/modules/dnn/src/int8layers/quantization_utils.cpp
+++ b/modules/dnn/src/int8layers/quantization_utils.cpp
@@ -15,7 +15,10 @@ namespace dnn
 static void broadcast1D2TargetMat(Mat& data, const MatShape& targetShape, int axis)
 {
     // The data is the 1-D scales or zeropoints.
-    CV_Assert(axis >= 0 && targetShape.size() > axis && data.total() == targetShape[axis]);
+    CV_CheckGE(axis, 0, "Quantization axis must be non-negative.");
+    CV_CheckGT((int)targetShape.size(),axis,"Quantization axis must be within the valid range of target shape dimensions.");
+    CV_CheckEQ((int)data.total(), (int)targetShape[axis], "Data total size must match the size of the specified target dimension.");
+
     std::vector<int> broadcast_axes;
     for (int i = 0; i < targetShape.size(); i++)
     {
@@ -35,29 +38,98 @@ static void broadcast1D2TargetMat(Mat& data, const MatShape& targetShape, int ax
     }
 }
 
-static void broadcastScaleAndZeropoint(Mat& scalesMat, Mat& zeropointsMat, const std::vector<float>& scales,
-                                       const std::vector<int>& zeropoints, const MatShape& targetShape, int axis)
+static void block_repeat(InputArray src, const MatShape& srcShape, int axis, int repetitions, OutputArray dst)
 {
-    // broad cast the scales and zeropoint to the input shape.
-    MatShape subTargetShape(targetShape.size(), 1);
-    subTargetShape[axis] = scales.size();
+    CV_Assert(src.getObj() != dst.getObj());
+    CV_Check(axis, axis >= 0 && axis < src.dims(), "Axis out of range");
+    CV_CheckGT(repetitions, 1, "More than one repetition expected");
 
-    zeropointsMat.create(subTargetShape.size(), subTargetShape.data(), CV_32FC1);
-    scalesMat.create(subTargetShape.size(), subTargetShape.data(), CV_32FC1);
+    Mat src_mat = src.getMat();
+    Mat dst_mat;
 
-    const int len = scales.size();
-    // Deep copy the scales and zeropoint data and prevent the original data from being changed.
+    if (src_mat.depth() != CV_32F)
+        src_mat.convertTo(src_mat, CV_32F);
 
-    float * scalePtr = scalesMat.ptr<float>(0);
-    for (int i = 0; i < len; i++)
-        scalePtr[i] = scales[i];
+    MatShape sshape = srcShape;
+    MatShape dshape = srcShape;
+
+    size_t dtype_bytes = src_mat.elemSize();
+    int chunk_size = dtype_bytes;
+    int num_chunks = 1;
+
+    dshape[axis] *= repetitions;
+
+    for (int i = axis+1; i < sshape.size(); ++i)
+        chunk_size*=sshape[i];
+
+    for (int i = 0; i <= axis; ++i)
+        num_chunks*=sshape[i];
+
+    dst.create(dshape.size(), dshape.data(), src_mat.type());
+    dst_mat = dst.getMat();
+
+    CV_Assert(dst_mat.isContinuous());
+    CV_Assert(src_mat.isContinuous());
+
+    for (int i = 0; i < repetitions; ++i) {
+        size_t src_offset = 0;
+        size_t dst_offset = i * chunk_size;
+
+        for (int j = 0; j < num_chunks; ++j) {
+            memcpy(dst_mat.data + dst_offset, src_mat.data + src_offset, chunk_size);
+            src_offset += chunk_size;
+            dst_offset += chunk_size * repetitions;
+        }
+    }
+}
+
+template <typename T>
+static void copyVecToMat(Mat& mat, const std::vector<T>& data){
+    float * matPtr = mat.ptr<float>(0);
+    const int len = data.size();
 
-    float * zpPtr = zeropointsMat.ptr<float>(0);
     for (int i = 0; i < len; i++)
-        zpPtr[i] = (float )zeropoints[i];
+        matPtr[i] = (float) data[i];
+}
 
-    broadcast1D2TargetMat(scalesMat, targetShape, axis);
-    broadcast1D2TargetMat(zeropointsMat, targetShape, axis);
+template <typename T>
+static void broadcastBlockedMatrix(Mat& mat, const std::vector<T>& data, const MatShape& targetShape, int axis, int block_size){
+    CV_Check(block_size, targetShape[axis] % block_size == 0 && block_size <= targetShape[axis], "Block size must be a divisor of the target dimension size and not exceed it.");
+
+    MatShape subTargetShape(targetShape);
+    subTargetShape[axis] = static_cast<int>(subTargetShape[axis] / block_size);
+
+    block_repeat(data, subTargetShape, axis, block_size, mat);
+}
+
+template <typename T>
+static void broadcastStandardMatrix(Mat& mat, const std::vector<T>& data, const MatShape& targetShape, int axis)
+{
+    MatShape subTargetShape(targetShape.size(), 1);
+    subTargetShape[axis] = data.size();
+    mat.create(subTargetShape.size(), subTargetShape.data(), CV_32FC1);
+
+    copyVecToMat(mat,data);
+
+    broadcast1D2TargetMat(mat, targetShape, axis);
+}
+
+
+static void broadcastScaleAndZeropoint(Mat& scalesMat, Mat& zeropointsMat, const std::vector<float>& scales,
+                                       const std::vector<int>& zeropoints, const MatShape& targetShape, int axis, int block_size)
+{
+    // broad cast the scales and zeropoint to the input shape.
+
+    if (block_size == 0)
+    {
+        broadcastStandardMatrix(zeropointsMat, zeropoints, targetShape, axis);
+        broadcastStandardMatrix(scalesMat, scales, targetShape, axis);
+    }
+    else
+    {
+        broadcastBlockedMatrix(zeropointsMat, zeropoints, targetShape, axis, block_size);
+        broadcastBlockedMatrix(scalesMat, scales, targetShape, axis, block_size);
+    }
 }
 
 // Quantize FP32/FP16 Inputs to INT8
@@ -65,13 +137,17 @@ class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
 {
 public:
     int axis;
+    int block_size;
     bool is1D;
-    Mat scalesMat, zeropointsMat; // Saving the broadcasetd scales data.
+    Mat scalesMat, zeropointsMat; // Saving the broadcasted scales data.
+    bool quantParamExternal = true;  // Indicates if the quantization parameters (scale and zero point) are provided as inputs to the node.
 
     QuantizeLayerImpl(const LayerParams& params)
     {
         is1D = params.get<bool>("is1D", false);
         axis = params.get<int>("axis", 1);
+        block_size = params.get<int>("block_size", 0);
+
         if (!is1D)
         {
             scales.push_back(params.get<float>("scales", 1.0f));
@@ -82,7 +158,7 @@ public:
             DictValue paramScales = params.get("scales");
             int i, n = paramScales.size();
 
-            CV_Assert(n > 0);
+            CV_CheckGT(n, 0, "Scale missing.");
             scales.resize(n, 0.);
             for (i = 0; i < n; i++)
                 scales[i] = paramScales.get<float>(i);
@@ -108,7 +184,7 @@ public:
                          std::vector<MatShape> &outputs,
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
-        CV_Assert(inputs.size() == 1);
+        CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
         Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
         return false;
     }
@@ -124,7 +200,7 @@ public:
         if (is1D)
         {
             MatShape inputShape = shape(inputs[0]);
-            broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis);
+            broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size);
         }
     }
 
@@ -146,6 +222,39 @@ public:
         return true;
     }
 #endif
+    void processInputOutput(std::vector<Mat>& inputs, std::vector<Mat>& outputs)
+    {
+        CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
+        quantParamExternal &= inputs.size() > 1;
+
+        // Scale and zeropoint taken as input
+        if (quantParamExternal)
+        {
+            quantParamExternal = false;
+            scalesMat = inputs[1];
+
+            scalesMat.reshape(1, 1).copyTo(scales);
+
+            if(scalesMat.total() > 1) is1D = true;
+
+
+            if (inputs.size() > 2)
+            {
+                zeropointsMat = inputs[2];
+                CV_CheckEQ((int)zeropointsMat.total(), (int)scalesMat.total(), "Scale and zero point elements number must match.");
+                zeropointsMat.reshape(1, 1).copyTo(zeropoints);
+            }
+
+            if (is1D)
+            {
+                MatShape inputShape = shape(inputs[0]);
+                broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size);
+            }
+        }
+
+        if (outputs[0].depth() != CV_8S)
+            outputs[0].convertTo(outputs[0], CV_8S);
+    }
 
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
     {
@@ -159,14 +268,13 @@ public:
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
 
-        if (outputs[0].depth() != CV_8S)
-            outputs[0].convertTo(outputs[0], CV_8S);
+        processInputOutput(inputs, outputs);
 
         if (is1D)
         {
             Mat inputTmp;
             divide(inputs[0], scalesMat, inputTmp);
-            subtract(inputTmp, zeropointsMat, inputTmp);
+            add(inputTmp, zeropointsMat, inputTmp);
 
             inputTmp.convertTo(outputs[0], CV_8S);
         }
@@ -190,13 +298,16 @@ class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
 {
 public:
     int axis;
+    int block_size;
     bool is1D;
     Mat scalesMat, zeropointsMat; // Saving the broadcasetd scales data.
+    bool quantParamExternal = true;
 
     DequantizeLayerImpl(const LayerParams& params)
     {
         is1D = params.get<bool>("is1D", false);
         axis = params.get<int>("axis", 1);
+        block_size = params.get<int>("block_size", 0);
 
         if (!is1D)
         {
@@ -208,7 +319,7 @@ public:
             DictValue paramScales = params.get("scales");
             int i, n = paramScales.size();
 
-            CV_Assert(n > 0);
+            CV_CheckGT(n, 0, "Scale missing.");
             scales.resize(n);
             for (i = 0; i < n; i++)
                 scales[i] = paramScales.get<float>(i);
@@ -234,7 +345,7 @@ public:
                          std::vector<MatShape> &outputs,
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
-        CV_Assert(inputs.size() == 1);
+        CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
         Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
         return false;
     }
@@ -250,7 +361,7 @@ public:
         if (is1D)
         {
             MatShape inputShape = shape(inputs[0]);
-            broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis);
+            broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size);
         }
     }
 
@@ -269,6 +380,39 @@ public:
     }
 #endif
 
+    void processInputOutput(std::vector<Mat>& inputs, std::vector<Mat>& outputs)
+    { // Picks up scale/zero point passed as extra inputs (once) and makes sure outputs[0] has CV_32F depth.
+        CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
+
+        quantParamExternal &= inputs.size() > 1; // with a single input there is nothing external to read; the flag then stays false
+        // Scale and zero point supplied as layer inputs: read on the first call only.
+        if (quantParamExternal)
+        {
+            quantParamExternal = false; // one-shot: later calls skip this branch
+            scalesMat = inputs[1];
+
+            scalesMat.reshape(1, 1).copyTo(scales); // flat copy of the scales, passed to broadcastScaleAndZeropoint() below
+
+            if(scalesMat.total() > 1) is1D = true; // more than one scale value selects the is1D path
+
+            if (inputs.size() > 2) // the third input, when present, holds the zero points
+            {
+                zeropointsMat = inputs[2];
+                CV_CheckEQ((int)zeropointsMat.total(), (int)scalesMat.total(), "Scale and zero point elements number must match.");
+                zeropointsMat.reshape(1, 1).copyTo(zeropoints);
+            }
+
+            if (is1D)
+            {
+                MatShape inputShape = shape(inputs[0]);
+                broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size); // NOTE(review): presumably rebuilds scalesMat/zeropointsMat for inputShape - helper is defined outside this hunk
+            }
+        }
+
+        if (outputs[0].depth() != CV_32F)
+            outputs[0].convertTo(outputs[0], CV_32F);
+    }
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
     {
         CV_TRACE_FUNCTION();
@@ -281,8 +425,7 @@ public:
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
 
-        if (outputs[0].depth() != CV_32F)
-            outputs[0].convertTo(outputs[0], CV_32F);
+        processInputOutput(inputs, outputs);
 
         if (is1D)
         {
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 3745d7ed86..e91e2605c5 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -3239,6 +3239,17 @@ void ONNXImporter::parseQuantDequant(LayerParams& layerParams, const opencv_onnx
     // or 1-D tensor (per-channel quantized).
     bool is1D = false;
 
+    if (layerParams.type == "Quantize")
+        layerParams.set("depth", CV_8S);
+    else // Dequantize
+        layerParams.set("depth", CV_32F);
+
+    // If scale is not defined as a constant blob, it is considered an external input.
+    if(constBlobs.find(node_proto.input(1)) == constBlobs.end()){
+        addLayer(layerParams, node_proto);
+        return;
+    }
+
     Mat scaleMat = getBlob(node_proto, 1);
     if(scaleMat.total() > 1) is1D = true;
 
@@ -3280,11 +3291,6 @@ void ONNXImporter::parseQuantDequant(LayerParams& layerParams, const opencv_onnx
         layerParams.set("zeropoints", zeropoint);
     }
 
-    if (layerParams.type == "Quantize")
-        layerParams.set("depth", CV_8S);
-    else // Dequantize
-        layerParams.set("depth", CV_32F);
-
     if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) // Variable input.
     {
         std::vector<Mat> inputs, outputs;
diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp
index 57969ced87..0199d29548 100644
--- a/modules/dnn/test/test_onnx_conformance.cpp
+++ b/modules/dnn/test/test_onnx_conformance.cpp
@@ -224,6 +224,7 @@ static const TestCase testConformanceConfig[] = {
     {"test_depthtospace_example", 1, 1},
     {"test_dequantizelinear", 3, 1},
     {"test_dequantizelinear_axis", 3, 1},
+    {"test_dequantizelinear_blocked", 3, 1},
     {"test_det_2d", 1, 1},
     {"test_det_nd", 1, 1},
     {"test_div", 2, 1},
@@ -569,6 +570,7 @@ static const TestCase testConformanceConfig[] = {
     {"test_qlinearmatmul_3D", 8, 1},
     {"test_quantizelinear", 3, 1},
     {"test_quantizelinear_axis", 3, 1},
+    {"test_quantizelinear_blocked", 3, 1},
     {"test_range_float_type_positive_delta", 3, 1},
     {"test_range_float_type_positive_delta_expanded", 3, 1},
     {"test_range_int32_type_negative_delta", 3, 1},
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
index 9b2a2f4f2d..9069a69ff4 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
@@ -565,9 +565,11 @@ CASE(test_depthtospace_dcr_mode)
 CASE(test_depthtospace_example)
     // no filter
 CASE(test_dequantizelinear)
-    // no filter
+    SKIP;
 CASE(test_dequantizelinear_axis)
-    // no filter
+    SKIP;
+CASE(test_dequantizelinear_blocked)
+    SKIP;
 CASE(test_det_2d)
     // no filter
 CASE(test_det_nd)
@@ -1348,9 +1350,11 @@ CASE(test_qlinearmatmul_2D)
 CASE(test_qlinearmatmul_3D)
     // no filter
 CASE(test_quantizelinear)
-    // no filter
+    SKIP;
 CASE(test_quantizelinear_axis)
-    // no filter
+    SKIP;
+CASE(test_quantizelinear_blocked)
+    SKIP;
 CASE(test_range_float_type_positive_delta)
     // no filter
 CASE(test_range_float_type_positive_delta_expanded)
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp
index 968dd1e025..f6aee0dd36 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp
@@ -48,6 +48,9 @@
 "test_cumsum_2d_axis_1",
 "test_cumsum_2d_negative_axis",
 "test_concat_1d_axis_negative_1",
+"test_dequantizelinear",
+"test_dequantizelinear_axis",
+"test_dequantizelinear_blocked",
 "test_div_uint8",
 "test_flatten_axis0",
 "test_flatten_axis2",
@@ -71,6 +74,9 @@
 "test_pow_types_float32_int32", // vulkan backend does not take tensor other than float32 data type
 "test_pow_types_float32_int64", // vulkan backend does not take tensor other than float32 data type
 "test_pow_types_int", // vulkan backend does not take tensor other than float32 data type
+"test_quantizelinear",
+"test_quantizelinear_axis",
+"test_quantizelinear_blocked",
 "test_softmax_default_axis",
 "test_sub_bcast",
 "test_sub_uint8",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp16_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp16_denylist.inl.hpp
index 7303348d10..8dc970fe1e 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp16_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp16_denylist.inl.hpp
@@ -1,4 +1,7 @@
 "test_averagepool_3d_default",
+"test_dequantizelinear",
+"test_dequantizelinear_axis",
+"test_dequantizelinear_blocked",
 "test_dropout_default_ratio",
 "test_globalmaxpool",
 "test_globalmaxpool_precomputed",
@@ -14,7 +17,10 @@
 "test_maxpool_2d_same_upper",
 "test_maxpool_2d_strides",
 "test_maxpool_3d_default",
-"test_pow", // fp16 accuracy issue
+"test_pow",
+"test_quantizelinear",
+"test_quantizelinear_axis",
+"test_quantizelinear_blocked",
 "test_softmax_large_number",
 "test_softmax_large_number_expanded",
 "test_split_equal_parts_1d",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp32_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp32_denylist.inl.hpp
index 7fe58a07fd..2453e2ad9f 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp32_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp32_denylist.inl.hpp
@@ -1,5 +1,11 @@
 "test_averagepool_3d_default",
+"test_dequantizelinear",
+"test_dequantizelinear_axis",
+"test_dequantizelinear_blocked",
 "test_maxpool_3d_default",
+"test_quantizelinear",
+"test_quantizelinear_axis",
+"test_quantizelinear_blocked",
 "test_scatter_elements_with_axis",
 "test_scatter_elements_with_duplicate_indices",
 "test_scatter_elements_with_negative_indices",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
index 78c26eeea2..7b408619d2 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
@@ -89,8 +89,6 @@
 "test_convtranspose_pad",
 "test_convtranspose_pads",
 "test_convtranspose_with_kernel",
-"test_dequantizelinear",
-"test_dequantizelinear_axis",
 "test_det_2d",
 "test_det_nd",
 "test_dropout_default_mask",
@@ -290,8 +288,6 @@
 "test_qlinearconv",
 "test_qlinearmatmul_2D",
 "test_qlinearmatmul_3D",
-"test_quantizelinear",
-"test_quantizelinear_axis",
 "test_range_float_type_positive_delta",
 "test_range_float_type_positive_delta_expanded",
 "test_range_int32_type_negative_delta",

From 93745245a362aea9a57f6c5f767f88ea503985fe Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Wed, 31 Jul 2024 18:05:33 +0300
Subject: [PATCH 08/17] Improved error handling in image codecs.

---
 modules/imgcodecs/src/bitstrm.cpp      | 54 ++++++++++++++++++--------
 modules/imgcodecs/src/bitstrm.hpp      | 20 ++++++----
 modules/imgcodecs/src/grfmt_bmp.cpp    | 38 +++++++++---------
 modules/imgcodecs/src/grfmt_pfm.cpp    | 34 ++++++++--------
 modules/imgcodecs/src/grfmt_pxm.cpp    |  8 ++--
 modules/imgcodecs/src/grfmt_sunras.cpp | 18 ++++-----
 modules/imgcodecs/src/grfmt_tiff.cpp   | 10 -----
 modules/imgcodecs/src/grfmt_tiff.hpp   |  4 --
 8 files changed, 99 insertions(+), 87 deletions(-)

diff --git a/modules/imgcodecs/src/bitstrm.cpp b/modules/imgcodecs/src/bitstrm.cpp
index 97df645a6d..a8f91aa4dd 100644
--- a/modules/imgcodecs/src/bitstrm.cpp
+++ b/modules/imgcodecs/src/bitstrm.cpp
@@ -377,26 +377,30 @@ void  WBaseStream::allocate()
 }
 
 
-void  WBaseStream::writeBlock()
+bool  WBaseStream::writeBlock() // flushes [m_start, m_current); returns false only when fwrite() writes fewer bytes than requested
 {
     int size = (int)(m_current - m_start);
 
     CV_Assert(isOpened());
     if( size == 0 )
-        return;
+        return true; // nothing buffered, nothing to flush
 
     if( m_buf )
     {
         size_t sz = m_buf->size();
         m_buf->resize( sz + size );
         memcpy( &(*m_buf)[sz], m_start, size );
+        m_current = m_start;
+        m_block_pos += size;
+        return true; // bytes were appended to the in-memory buffer
     }
     else
     {
-        fwrite( m_start, 1, size, m_file );
+        size_t written = fwrite( m_start, 1, size, m_file );
+        m_current = m_start; // the buffer is rewound even when fwrite() came up short
+        m_block_pos += size;
+        return written == (size_t)size; // a short write is reported to the caller
     }
-    m_current = m_start;
-    m_block_pos += size;
 }
 
 
@@ -463,15 +467,17 @@ WLByteStream::~WLByteStream()
 {
 }
 
-void WLByteStream::putByte( int val )
+bool  WLByteStream::putByte( int val ) // stores one byte; false only if the flush it triggers fails
 {
     *m_current++ = (uchar)val;
     if( m_current >= m_end )
-        writeBlock();
+        return writeBlock(); // buffer full: flush it and pass the result on
+
+    return true;
 }
 
 
-void WLByteStream::putBytes( const void* buffer, int count )
+bool  WLByteStream::putBytes( const void* buffer, int count )
 {
     uchar* data = (uchar*)buffer;
 
@@ -492,12 +498,18 @@ void WLByteStream::putBytes( const void* buffer, int count )
             count -= l;
         }
         if( m_current == m_end )
-            writeBlock();
+        {
+            bool written = writeBlock();
+            if (!written)
+                return false;
+        }
     }
+
+    return true;
 }
 
 
-void WLByteStream::putWord( int val )
+bool  WLByteStream::putWord( int val )
 {
     uchar *current = m_current;
 
@@ -507,17 +519,19 @@ void WLByteStream::putWord( int val )
         current[1] = (uchar)(val >> 8);
         m_current = current + 2;
         if( m_current == m_end )
-            writeBlock();
+            return writeBlock();
     }
     else
     {
         putByte(val);
         putByte(val >> 8);
     }
+
+    return true;
 }
 
 
-void WLByteStream::putDWord( int val )
+bool  WLByteStream::putDWord( int val )
 {
     uchar *current = m_current;
 
@@ -529,7 +543,7 @@ void WLByteStream::putDWord( int val )
         current[3] = (uchar)(val >> 24);
         m_current = current + 4;
         if( m_current == m_end )
-            writeBlock();
+            return writeBlock();
     }
     else
     {
@@ -538,6 +552,8 @@ void WLByteStream::putDWord( int val )
         putByte(val >> 16);
         putByte(val >> 24);
     }
+
+    return true;
 }
 
 
@@ -548,7 +564,7 @@ WMByteStream::~WMByteStream()
 }
 
 
-void WMByteStream::putWord( int val )
+bool  WMByteStream::putWord( int val )
 {
     uchar *current = m_current;
 
@@ -558,17 +574,19 @@ void WMByteStream::putWord( int val )
         current[1] = (uchar)val;
         m_current = current + 2;
         if( m_current == m_end )
-            writeBlock();
+            return writeBlock();
     }
     else
     {
         putByte(val >> 8);
         putByte(val);
     }
+
+    return true;
 }
 
 
-void WMByteStream::putDWord( int val )
+bool  WMByteStream::putDWord( int val )
 {
     uchar *current = m_current;
 
@@ -580,7 +598,7 @@ void WMByteStream::putDWord( int val )
         current[3] = (uchar)val;
         m_current = current + 4;
         if( m_current == m_end )
-            writeBlock();
+            return writeBlock();
     }
     else
     {
@@ -589,6 +607,8 @@ void WMByteStream::putDWord( int val )
         putByte(val >> 8);
         putByte(val);
     }
+
+    return true;
 }
 
 }
diff --git a/modules/imgcodecs/src/bitstrm.hpp b/modules/imgcodecs/src/bitstrm.hpp
index 26947971f3..ebffb91f10 100644
--- a/modules/imgcodecs/src/bitstrm.hpp
+++ b/modules/imgcodecs/src/bitstrm.hpp
@@ -63,6 +63,12 @@ DECLARE_RBS_EXCEPTION(THROW_FORB)
 DECLARE_RBS_EXCEPTION(BAD_HEADER)
 #define RBS_BAD_HEADER RBS_BAD_HEADER_Exception(cv::Error::StsError, "Invalid header", CV_Func, __FILE__, __LINE__)
 
+#define CHECK_WRITE(action) /* returns false from the enclosing function when `action` evaluates to false */ \
+if (!(action)) \
+{ \
+    return false; \
+}
+
 typedef unsigned long ulong;
 
 // class RBaseStream - base class for other reading streams.
@@ -147,7 +153,7 @@ protected:
     bool    m_is_opened;
     std::vector<uchar>* m_buf;
 
-    virtual void  writeBlock();
+    virtual bool  writeBlock();
     virtual void  release();
     virtual void  allocate();
 };
@@ -160,10 +166,10 @@ class WLByteStream : public WBaseStream
 public:
     virtual ~WLByteStream();
 
-    void  putByte( int val );
-    void  putBytes( const void* buffer, int count );
-    void  putWord( int val );
-    void  putDWord( int val );
+    bool putByte( int val );
+    bool putBytes( const void* buffer, int count );
+    bool putWord( int val );
+    bool putDWord( int val );
 };
 
 
@@ -173,8 +179,8 @@ class WMByteStream : public WLByteStream
 {
 public:
     virtual ~WMByteStream();
-    void  putWord( int val );
-    void  putDWord( int val );
+    bool putWord( int val );
+    bool putDWord( int val );
 };
 
 inline unsigned BSWAP(unsigned v)
diff --git a/modules/imgcodecs/src/grfmt_bmp.cpp b/modules/imgcodecs/src/grfmt_bmp.cpp
index 91ef23cc3f..e69a93c78b 100644
--- a/modules/imgcodecs/src/grfmt_bmp.cpp
+++ b/modules/imgcodecs/src/grfmt_bmp.cpp
@@ -635,38 +635,40 @@ bool  BmpEncoder::write( const Mat& img, const std::vector<int>& )
         m_buf->reserve( alignSize(fileSize + 16, 256) );
 
     // write signature 'BM'
-    strm.putBytes( fmtSignBmp, (int)strlen(fmtSignBmp) );
+    CHECK_WRITE(strm.putBytes( fmtSignBmp, (int)strlen(fmtSignBmp) ));
 
     // write file header
-    strm.putDWord( validateToInt(fileSize) ); // file size
-    strm.putDWord( 0 );
-    strm.putDWord( headerSize );
+    CHECK_WRITE(strm.putDWord( validateToInt(fileSize) )); // file size
+    CHECK_WRITE(strm.putDWord( 0 ));
+    CHECK_WRITE(strm.putDWord( headerSize ));
 
     // write bitmap header
-    strm.putDWord( bitmapHeaderSize );
-    strm.putDWord( width );
-    strm.putDWord( height );
-    strm.putWord( 1 );
-    strm.putWord( channels << 3 );
-    strm.putDWord( BMP_RGB );
-    strm.putDWord( 0 );
-    strm.putDWord( 0 );
-    strm.putDWord( 0 );
-    strm.putDWord( 0 );
-    strm.putDWord( 0 );
+    CHECK_WRITE(strm.putDWord( bitmapHeaderSize ));
+    CHECK_WRITE(strm.putDWord( width ));
+    CHECK_WRITE(strm.putDWord( height ));
+    CHECK_WRITE(strm.putWord( 1 ));
+    CHECK_WRITE(strm.putWord( channels << 3 ));
+    CHECK_WRITE(strm.putDWord( BMP_RGB ));
+    CHECK_WRITE(strm.putDWord( 0 ));
+    CHECK_WRITE(strm.putDWord( 0 ));
+    CHECK_WRITE(strm.putDWord( 0 ));
+    CHECK_WRITE(strm.putDWord( 0 ));
+    CHECK_WRITE(strm.putDWord( 0 ));
 
     if( channels == 1 )
     {
         FillGrayPalette( palette, 8 );
-        strm.putBytes( palette, sizeof(palette));
+        CHECK_WRITE(strm.putBytes( palette, sizeof(palette)));
     }
 
     width *= channels;
     for( int y = height - 1; y >= 0; y-- )
     {
-        strm.putBytes( img.ptr(y), width );
+        CHECK_WRITE(strm.putBytes( img.ptr(y), width ));
         if( fileStep > width )
-            strm.putBytes( zeropad, fileStep - width );
+        {
+            CHECK_WRITE(strm.putBytes( zeropad, fileStep - width ));
+        }
     }
 
     strm.close();
diff --git a/modules/imgcodecs/src/grfmt_pfm.cpp b/modules/imgcodecs/src/grfmt_pfm.cpp
index b213d18fde..61cab06714 100644
--- a/modules/imgcodecs/src/grfmt_pfm.cpp
+++ b/modules/imgcodecs/src/grfmt_pfm.cpp
@@ -64,11 +64,11 @@ T read_number(cv::RLByteStream& strm)
   return atoT<T>(str);
 }
 
-template<typename T> void write_anything(cv::WLByteStream& strm, const T& t)
+template<typename T> bool write_anything(cv::WLByteStream& strm, const T& t) // writes the operator<< text of t to strm
 {
   std::ostringstream ss;
   ss << t;
-  strm.putBytes(ss.str().c_str(), static_cast<int>(ss.str().size()));
+  return strm.putBytes(ss.str().c_str(), static_cast<int>(ss.str().size())); // forwards the putBytes() result so callers can detect a failed write
 }
 
 }
@@ -206,33 +206,33 @@ bool PFMEncoder::write(const Mat& img, const std::vector<int>& params)
   }
 
   Mat float_img;
-  strm.putByte('P');
+  CHECK_WRITE(strm.putByte('P'));
   switch (img.channels()) {
   case 1:
-    strm.putByte('f');
+    CHECK_WRITE(strm.putByte('f'));
     img.convertTo(float_img, CV_32FC1);
     break;
   case 3:
-    strm.putByte('F');
+    CHECK_WRITE(strm.putByte('F'));
     img.convertTo(float_img, CV_32FC3);
     break;
   default:
     CV_Error(Error::StsBadArg, "Expected 1 or 3 channel image.");
   }
-  strm.putByte('\n');
+  CHECK_WRITE(strm.putByte('\n'));
 
 
-  write_anything(strm, float_img.cols);
-  strm.putByte(' ');
-  write_anything(strm, float_img.rows);
-  strm.putByte('\n');
+  CHECK_WRITE(write_anything(strm, float_img.cols));
+  CHECK_WRITE(strm.putByte(' '));
+  CHECK_WRITE(write_anything(strm, float_img.rows));
+  CHECK_WRITE(strm.putByte('\n'));
 #ifdef WORDS_BIGENDIAN
-  write_anything(strm, 1.0);
+  CHECK_WRITE(write_anything(strm, 1.0));
 #else
-  write_anything(strm, -1.0);
+  CHECK_WRITE(write_anything(strm, -1.0));
 #endif
 
-  strm.putByte('\n');
+  CHECK_WRITE(strm.putByte('\n'));
 
   // Comments are not officially supported in this file format.
   // write_anything(strm, "# Generated by OpenCV " CV_VERSION "\n");
@@ -248,17 +248,15 @@ bool PFMEncoder::write(const Mat& img, const std::vector<int>& params)
         rgb_row[x*3+1] = bgr_row[x*3+1];
         rgb_row[x*3+2] = bgr_row[x*3+0];
       }
-      strm.putBytes( reinterpret_cast<const uchar*>(rgb_row.data()),
-                     static_cast<int>(sizeof(float) * row_size) );
+      CHECK_WRITE(strm.putBytes( reinterpret_cast<const uchar*>(rgb_row.data()),
+                     static_cast<int>(sizeof(float) * row_size) ));
     } else if (float_img.channels() == 1) {
-      strm.putBytes(float_img.ptr(y), sizeof(float) * float_img.cols);
+      CHECK_WRITE(strm.putBytes(float_img.ptr(y), sizeof(float) * float_img.cols));
     }
   }
   return true;
 }
 
-
 }
 
-
 #endif // HAVE_IMGCODEC_PFM
diff --git a/modules/imgcodecs/src/grfmt_pxm.cpp b/modules/imgcodecs/src/grfmt_pxm.cpp
index d2ce60c743..20c815e833 100644
--- a/modules/imgcodecs/src/grfmt_pxm.cpp
+++ b/modules/imgcodecs/src/grfmt_pxm.cpp
@@ -479,7 +479,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
         header_sz += sz;
     }
 
-    strm.putBytes(buffer, header_sz);
+    CHECK_WRITE(strm.putBytes(buffer, header_sz));
 
     for( y = 0; y < height; y++ )
     {
@@ -512,7 +512,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
                 {
                     *ptr++ = byte;
                 }
-                strm.putBytes(buffer, (int)(ptr - buffer));
+                CHECK_WRITE(strm.putBytes(buffer, (int)(ptr - buffer)));
                 continue;
             }
 
@@ -539,7 +539,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
                 }
             }
 
-            strm.putBytes( (channels > 1 || depth > 8) ? buffer : (const char*)data, fileStep);
+            CHECK_WRITE(strm.putBytes( (channels > 1 || depth > 8) ? buffer : (const char*)data, fileStep));
         }
         else
         {
@@ -610,7 +610,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
 
             *ptr++ = '\n';
 
-            strm.putBytes( buffer, (int)(ptr - buffer) );
+            CHECK_WRITE(strm.putBytes( buffer, (int)(ptr - buffer) ));
         }
     }
 
diff --git a/modules/imgcodecs/src/grfmt_sunras.cpp b/modules/imgcodecs/src/grfmt_sunras.cpp
index 798f295376..852e735477 100644
--- a/modules/imgcodecs/src/grfmt_sunras.cpp
+++ b/modules/imgcodecs/src/grfmt_sunras.cpp
@@ -410,17 +410,17 @@ bool  SunRasterEncoder::write( const Mat& img, const std::vector<int>& )
 
     if( strm.open(m_filename) )
     {
-        strm.putBytes( fmtSignSunRas, (int)strlen(fmtSignSunRas) );
-        strm.putDWord( width );
-        strm.putDWord( height );
-        strm.putDWord( channels*8 );
-        strm.putDWord( fileStep*height );
-        strm.putDWord( RAS_STANDARD );
-        strm.putDWord( RMT_NONE );
-        strm.putDWord( 0 );
+        CHECK_WRITE(strm.putBytes( fmtSignSunRas, (int)strlen(fmtSignSunRas) ));
+        CHECK_WRITE(strm.putDWord( width ));
+        CHECK_WRITE(strm.putDWord( height ));
+        CHECK_WRITE(strm.putDWord( channels*8 ));
+        CHECK_WRITE(strm.putDWord( fileStep*height ));
+        CHECK_WRITE(strm.putDWord( RAS_STANDARD ));
+        CHECK_WRITE(strm.putDWord( RMT_NONE ));
+        CHECK_WRITE(strm.putDWord( 0 ));
 
         for( y = 0; y < height; y++ )
-            strm.putBytes( img.ptr(y), fileStep );
+            CHECK_WRITE(strm.putBytes( img.ptr(y), fileStep ));
 
         strm.close();
         result = true;
diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp
index f68a6e5c0d..e2184663aa 100644
--- a/modules/imgcodecs/src/grfmt_tiff.cpp
+++ b/modules/imgcodecs/src/grfmt_tiff.cpp
@@ -1100,16 +1100,6 @@ bool TiffEncoder::isFormatSupported( int depth ) const
     return depth == CV_8U || depth == CV_8S || depth == CV_16U || depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F;
 }
 
-void  TiffEncoder::writeTag( WLByteStream& strm, TiffTag tag,
-                             TiffFieldType fieldType,
-                             int count, int value )
-{
-    strm.putWord( tag );
-    strm.putWord( fieldType );
-    strm.putDWord( count );
-    strm.putDWord( value );
-}
-
 class TiffEncoderBufHelper
 {
 public:
diff --git a/modules/imgcodecs/src/grfmt_tiff.hpp b/modules/imgcodecs/src/grfmt_tiff.hpp
index ee5bcb7018..0d1f511372 100644
--- a/modules/imgcodecs/src/grfmt_tiff.hpp
+++ b/modules/imgcodecs/src/grfmt_tiff.hpp
@@ -132,10 +132,6 @@ public:
     ImageEncoder newEncoder() const CV_OVERRIDE;
 
 protected:
-    void  writeTag( WLByteStream& strm, TiffTag tag,
-                    TiffFieldType fieldType,
-                    int count, int value );
-
     bool writeLibTiff( const std::vector<Mat>& img_vec, const std::vector<int>& params );
     bool write_32FC3_SGILOG(const Mat& img, void* tif);
 

From 2db7f8e82764cc4cd26d529155a70499833a0fdf Mon Sep 17 00:00:00 2001
From: chacha21 <pierre@chachatelier.fr>
Date: Thu, 1 Aug 2024 09:36:08 +0200
Subject: [PATCH 09/17] Adding getStdAllocator() to cv::cuda::GpuMat

To be on par with `cv::Mat`, let's add `cv::cuda::GpuMat::getStdAllocator()`
This is useful anyway, because when a user wants to use custom allocators, he might want to resort to the standard default allocator behaviour, not some other allocator that could have been set by `setDefaultAllocator()`
---
 modules/core/include/opencv2/core/cuda.hpp | 1 +
 modules/core/src/cuda/gpu_mat.cu           | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 9d210ed7b5..6cd6711582 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -118,6 +118,7 @@ public:
     //! default allocator
     CV_WRAP static GpuMat::Allocator* defaultAllocator();
     CV_WRAP static void setDefaultAllocator(GpuMat::Allocator* allocator);
+    CV_WRAP static GpuMat::Allocator* getStdAllocator();
 
     //! default constructor
     CV_WRAP explicit GpuMat(GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
diff --git a/modules/core/src/cuda/gpu_mat.cu b/modules/core/src/cuda/gpu_mat.cu
index c286f28eb0..a86888cac3 100644
--- a/modules/core/src/cuda/gpu_mat.cu
+++ b/modules/core/src/cuda/gpu_mat.cu
@@ -135,6 +135,7 @@ namespace
 
     DefaultAllocator cudaDefaultAllocator;
     GpuMat::Allocator* g_defaultAllocator = &cudaDefaultAllocator;
+    GpuMat::Allocator* g_stdAllocator = &cudaDefaultAllocator;
 }
 
 GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()
@@ -148,6 +149,12 @@ void cv::cuda::GpuMat::setDefaultAllocator(Allocator* allocator)
     g_defaultAllocator = allocator;
 }
 
+GpuMat::Allocator* cv::cuda::GpuMat::getStdAllocator()
+{
+    return g_stdAllocator; // points at cudaDefaultAllocator; setDefaultAllocator() only reassigns g_defaultAllocator
+}
+
+
 /////////////////////////////////////////////////////
 /// create
 

From f67d4852bf3845febbb28f330aa5365af65195ba Mon Sep 17 00:00:00 2001
From: chacha21 <pierre@chachatelier.fr>
Date: Thu, 1 Aug 2024 10:00:31 +0200
Subject: [PATCH 10/17] Added no-imp placeholder when HAVE_CUDA is false

---
 modules/core/src/cuda_gpu_mat.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/core/src/cuda_gpu_mat.cpp b/modules/core/src/cuda_gpu_mat.cpp
index a245b1a293..84b5b210d1 100644
--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@@ -420,6 +420,11 @@ void cv::cuda::GpuMat::setDefaultAllocator(Allocator* allocator)
     throw_no_cuda();
 }
 
+GpuMat::Allocator* cv::cuda::GpuMat::getStdAllocator()
+{
+    return 0; // placeholder for builds without CUDA: there is no allocator to hand out
+}
+
 void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
 {
     CV_UNUSED(_rows);

From 35463e079c60df2d5622c762460dc054b938b305 Mon Sep 17 00:00:00 2001
From: Junyan721113 <llh721113@outlook.com>
Date: Wed, 12 Jun 2024 16:01:43 +0800
Subject: [PATCH 11/17] feat: Part 1.5 - New Interfaces

---
 3rdparty/ndsrvp/include/core.hpp              |   2 +-
 3rdparty/ndsrvp/include/imgproc.hpp           |  45 +--
 3rdparty/ndsrvp/ndsrvp_hal.hpp                |   5 +-
 3rdparty/ndsrvp/src/cvutils.cpp               |  78 +++++
 3rdparty/ndsrvp/src/cvutils.hpp               | 108 ++++++
 3rdparty/ndsrvp/src/integral.cpp              |   2 +
 3rdparty/ndsrvp/src/remap.cpp                 | 188 +++++++++++
 3rdparty/ndsrvp/src/threshold.cpp             | 147 ++++-----
 3rdparty/ndsrvp/src/warpAffine.cpp            | 174 +++-------
 3rdparty/ndsrvp/src/warpPerspective.cpp       | 208 ++++--------
 CMakeLists.txt                                |   2 +-
 .../include/opencv2/imgproc/hal/hal.hpp       |   8 +
 .../include/opencv2/imgproc/hal/interface.h   |   6 +
 modules/imgproc/src/hal_replacement.hpp       |  50 +++
 modules/imgproc/src/imgwarp.cpp               | 307 ++++++++++--------
 15 files changed, 821 insertions(+), 509 deletions(-)
 create mode 100644 3rdparty/ndsrvp/src/cvutils.cpp
 create mode 100644 3rdparty/ndsrvp/src/cvutils.hpp
 create mode 100644 3rdparty/ndsrvp/src/remap.cpp

diff --git a/3rdparty/ndsrvp/include/core.hpp b/3rdparty/ndsrvp/include/core.hpp
index 190a1b926b..ee57668539 100644
--- a/3rdparty/ndsrvp/include/core.hpp
+++ b/3rdparty/ndsrvp/include/core.hpp
@@ -1,6 +1,6 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.	
+// of this distribution and at http://opencv.org/license.html.
 
 #ifndef OPENCV_NDSRVP_CORE_HPP
 #define OPENCV_NDSRVP_CORE_HPP
diff --git a/3rdparty/ndsrvp/include/imgproc.hpp b/3rdparty/ndsrvp/include/imgproc.hpp
index 3a572172a8..94104f0b71 100644
--- a/3rdparty/ndsrvp/include/imgproc.hpp
+++ b/3rdparty/ndsrvp/include/imgproc.hpp
@@ -1,18 +1,12 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.	
+// of this distribution and at http://opencv.org/license.html.
 
 #ifndef OPENCV_NDSRVP_IMGPROC_HPP
 #define OPENCV_NDSRVP_IMGPROC_HPP
 
 namespace cv {
 
-// ################ remap ################
-
-void remap(InputArray _src, OutputArray _dst,
-    InputArray _map1, InputArray _map2,
-    int interpolation, int borderType, const Scalar& borderValue);
-
 namespace ndsrvp {
 
 enum InterpolationMasks {
@@ -36,23 +30,36 @@ int integral(int depth, int sdepth, int sqdepth,
 
 // ################ warpAffine ################
 
-int warpAffine(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[6], int interpolation, int borderType, const double borderValue[4]);
+int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
 
-#undef cv_hal_warpAffine
-#define cv_hal_warpAffine (cv::ndsrvp::warpAffine)
+#undef cv_hal_warpAffineBlocklineNN
+#define cv_hal_warpAffineBlocklineNN (cv::ndsrvp::warpAffineBlocklineNN)
+
+int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
+
+#undef cv_hal_warpAffineBlockline
+#define cv_hal_warpAffineBlockline (cv::ndsrvp::warpAffineBlockline)
 
 // ################ warpPerspective ################
 
-int warpPerspective(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[9], int interpolation, int borderType, const double borderValue[4]);
+int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);
+
+#undef cv_hal_warpPerspectiveBlocklineNN
+#define cv_hal_warpPerspectiveBlocklineNN (cv::ndsrvp::warpPerspectiveBlocklineNN)
+
+int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
+
+#undef cv_hal_warpPerspectiveBlockline
+#define cv_hal_warpPerspectiveBlockline (cv::ndsrvp::warpPerspectiveBlockline)
+
+// ################ remap ################
+
+int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+    uchar *dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
+    float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4]);
 
-#undef cv_hal_warpPerspective
-#define cv_hal_warpPerspective (cv::ndsrvp::warpPerspective)
+#undef cv_hal_remap32f
+#define cv_hal_remap32f (cv::ndsrvp::remap32f)
 
 // ################ threshold ################
 
diff --git a/3rdparty/ndsrvp/ndsrvp_hal.hpp b/3rdparty/ndsrvp/ndsrvp_hal.hpp
index 7f12636520..8ceac78db3 100644
--- a/3rdparty/ndsrvp/ndsrvp_hal.hpp
+++ b/3rdparty/ndsrvp/ndsrvp_hal.hpp
@@ -1,13 +1,14 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.	
+// of this distribution and at http://opencv.org/license.html.
 
 #ifndef OPENCV_NDSRVP_HAL_HPP
 #define OPENCV_NDSRVP_HAL_HPP
 
-#include "opencv2/core/mat.hpp"
 #include <nds_intrinsic.h>
 
+#include "opencv2/core/hal/interface.h"
+
 #include "include/core.hpp"
 #include "include/imgproc.hpp"
 #include "include/features2d.hpp"
diff --git a/3rdparty/ndsrvp/src/cvutils.cpp b/3rdparty/ndsrvp/src/cvutils.cpp
new file mode 100644
index 0000000000..48e025488f
--- /dev/null
+++ b/3rdparty/ndsrvp/src/cvutils.cpp
@@ -0,0 +1,78 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+// fastMalloc
+
+// [0][1][2][3][4][5][6][7][8][9]
+//     ^udata
+//                          ^adata
+//              ^adata[-1] == udata
+
+void* fastMalloc(size_t size)
+{
+    uchar* const raw = (uchar*)malloc(size + sizeof(void*) + CV_MALLOC_ALIGN); // payload + back-pointer slot + alignment slack
+    if(raw == 0)
+        ndsrvp_error(Error::StsNoMem, "fastMalloc(): Not enough memory");
+    uchar** const slot = (uchar**)align((size_t)(raw + sizeof(uchar*)), CV_MALLOC_ALIGN); // first aligned address at least one pointer past raw
+    slot[-1] = raw; // keep the malloc() result just below the returned block so fastFree() can find it
+    return slot;
+}
+
+void fastFree(void* ptr)
+{
+    if(!ptr) // releasing a null pointer is a no-op
+        return;
+    uchar* const block = (uchar*)ptr;
+    uchar* const raw = ((uchar**)ptr)[-1]; // pointer stored by fastMalloc() right below the block
+    if(raw >= block || (block - raw) > (ptrdiff_t)(sizeof(void*) + CV_MALLOC_ALIGN)) // header must lie within the slack fastMalloc() reserves
+        ndsrvp_error(Error::StsBadArg, "fastFree(): Invalid memory block");
+    free(raw);
+}
+
+// borderInterpolate: map coordinate p into [0, len) according to borderType (-1 is returned for CV_HAL_BORDER_CONSTANT)
+
+int borderInterpolate(int p, int len, int borderType)
+{
+    if( (unsigned)p < (unsigned)len ) // one unsigned compare: a negative p turns into a huge value and fails the test
+        ;
+    else if( borderType == CV_HAL_BORDER_REPLICATE )
+        p = p < 0 ? 0 : len - 1; // clamp to the nearest edge
+    else if( borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101 )
+    {
+        int delta = borderType == CV_HAL_BORDER_REFLECT_101; // 1 for REFLECT_101, 0 for REFLECT
+        if( len == 1 )
+            return 0; // a single element can only mirror onto itself
+        do
+        {
+            if( p < 0 )
+                p = -p - 1 + delta; // mirror around the low edge
+            else
+                p = len - 1 - (p - len) - delta; // mirror around the high edge
+        }
+        while( (unsigned)p >= (unsigned)len ); // keep folding while p is still outside
+    }
+    else if( borderType == CV_HAL_BORDER_WRAP )
+    {
+        ndsrvp_assert(len > 0);
+        if( p < 0 )
+            p -= ((p - len + 1) / len) * len; // shift a negative p up by whole multiples of len
+        if( p >= len )
+            p %= len;
+    }
+    else if( borderType == CV_HAL_BORDER_CONSTANT )
+        p = -1; // sentinel: no source coordinate, caller substitutes the border value
+    else
+        ndsrvp_error(Error::StsBadArg, "borderInterpolate(): Unknown/unsupported border type");
+    return p;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/cvutils.hpp b/3rdparty/ndsrvp/src/cvutils.hpp
new file mode 100644
index 0000000000..8cf1476ed6
--- /dev/null
+++ b/3rdparty/ndsrvp/src/cvutils.hpp
@@ -0,0 +1,108 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_CVUTILS_HPP
+#define OPENCV_NDSRVP_CVUTILS_HPP
+
+#include <nds_intrinsic.h>
+
+#include "opencv2/core/hal/interface.h"
+
+#include <cstring>
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <array>
+#include <climits>
+#include <algorithm>
+
+// misc functions that are not exposed to the public interface
+
+namespace cv {
+
+namespace ndsrvp {
+
+void* fastMalloc(size_t size);
+void fastFree(void* ptr);
+int borderInterpolate(int p, int len, int borderType);
+
+#ifndef MAX
+#  define MAX(a,b)  ((a) < (b) ? (b) : (a))
+#endif
+
+#define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
+#define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
+
+#define CV_MALLOC_ALIGN 64
+
+// error codes
+
+enum Error{
+    StsNoMem = -4,
+    StsBadArg = -5,
+    StsAssert = -215
+};
+
+// output error
+
+#define ndsrvp_assert(expr) do { if(!(expr)) ndsrvp_error(Error::StsAssert, std::string(#expr)); } while(0) // do/while keeps the macro a single statement (safe in unbraced if/else)
+
+inline void ndsrvp_error(int code, const std::string& msg = "") // prints code (and msg, if any) to std::cerr; throws `code` as an int when it is negative
+{
+    std::cerr << "NDSRVP Error: code " << code << std::endl;
+    if(!msg.empty())
+        std::cerr << msg << std::endl;
+    if(code < 0) // only negative codes abort the caller; other values are just reported
+        throw code;
+}
+
+// clip & vclip
+
+inline int clip(int x, int a, int b)
+{
+    return x < a ? a : (x >= b ? b - 1 : x); // clamp x to the half-open range [a, b)
+}
+
+inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b) // per-lane counterpart of clip(): clamps each lane of x to [a, b)
+{
+    return (int32x2_t)__nds__bpick(__nds__bpick((long)x, (long)(b - 1), (long)(x < b)), (long)a, (long)(x >= a)); // bpick(p, q, m) takes p where m is set: (x >= a) ? ((x < b) ? x : b - 1) : a
+}
+
+// saturate
+
+template<typename _Tp> static inline _Tp saturate_cast(int v)    { return _Tp(v); } // generic fallbacks: plain conversion, no clamping
+template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
+template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+
+template<> inline uchar saturate_cast<uchar>(int v)     { return __nds__uclip32(v, 8); } // presumably clamps to the unsigned 8-bit range - confirm against the intrinsic reference
+template<> inline uchar saturate_cast<uchar>(float v)     { return saturate_cast<uchar>((int)lrintf(v)); } // float/double: round with lrintf/lrint, then reuse the int overload
+template<> inline uchar saturate_cast<uchar>(double v)     { return saturate_cast<uchar>((int)lrint(v)); }
+
+template<> inline char saturate_cast<char>(int v)     { return __nds__sclip32(v, 7); } // NOTE(review): plain char may be unsigned on some ABIs - confirm this matches the signed int8x8_t lanes it is paired with
+template<> inline char saturate_cast<char>(float v)     { return saturate_cast<char>((int)lrintf(v)); }
+template<> inline char saturate_cast<char>(double v)     { return saturate_cast<char>((int)lrint(v)); }
+
+template<> inline ushort saturate_cast<ushort>(int v)     { return __nds__uclip32(v, 16); }
+template<> inline ushort saturate_cast<ushort>(float v)     { return saturate_cast<ushort>((int)lrintf(v)); }
+template<> inline ushort saturate_cast<ushort>(double v)     { return saturate_cast<ushort>((int)lrint(v)); }
+
+template<> inline short saturate_cast<short>(int v)     { return __nds__sclip32(v, 15); }
+template<> inline short saturate_cast<short>(float v)     { return saturate_cast<short>((int)lrintf(v)); }
+template<> inline short saturate_cast<short>(double v)     { return saturate_cast<short>((int)lrint(v)); }
+
+template<> inline int saturate_cast<int>(float v)     { return (int)lrintf(v); } // int target: rounding only, the long result is narrowed without clamping
+template<> inline int saturate_cast<int>(double v)     { return (int)lrint(v); }
+
+// align
+
+inline long align(size_t v, int n)
+{
+    return (long)((v + (size_t)n - 1) & ~((size_t)n - 1)); // round v up to a multiple of n; the mask trick needs n to be a power of two
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
+
+#endif
diff --git a/3rdparty/ndsrvp/src/integral.cpp b/3rdparty/ndsrvp/src/integral.cpp
index 37030a8d4c..e1dd993a90 100644
--- a/3rdparty/ndsrvp/src/integral.cpp
+++ b/3rdparty/ndsrvp/src/integral.cpp
@@ -3,6 +3,8 @@
 // of this distribution and at http://opencv.org/license.html.	
 
 #include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
 
 namespace cv {
 
diff --git a/3rdparty/ndsrvp/src/remap.cpp b/3rdparty/ndsrvp/src/remap.cpp
new file mode 100644
index 0000000000..30e4d218e3
--- /dev/null
+++ b/3rdparty/ndsrvp/src/remap.cpp
@@ -0,0 +1,188 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+// remap32f: nearest-neighbour remap of CV_8U images driven by two float maps; every other case is declined with CV_HAL_ERROR_NOT_IMPLEMENTED
+int remap32f(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
+    float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4])
+{
+    const bool isRelative = ((interpolation & CV_HAL_WARP_RELATIVE_MAP) != 0); // map values are offsets from the destination pixel
+    interpolation &= ~CV_HAL_WARP_RELATIVE_MAP;
+
+    if( interpolation == CV_HAL_INTER_AREA ) // AREA is treated as LINEAR, which is declined just below
+        interpolation = CV_HAL_INTER_LINEAR;
+
+    if( interpolation != CV_HAL_INTER_NEAREST )
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // only CV_8U
+    if( (src_type & CV_MAT_DEPTH_MASK) != CV_8U )
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    int cn = CV_MAT_CN(src_type);
+
+    src_step /= sizeof(uchar);
+    dst_step /= sizeof(uchar);
+
+    // mapping CV_32FC1: steps are converted from bytes to float elements
+    mapx_step /= sizeof(float);
+    mapy_step /= sizeof(float);
+
+    // border: the four border values repeated across every possible channel
+    uchar border_const[CV_CN_MAX];
+    for( int k = 0; k < CV_CN_MAX; k++ )
+        border_const[k] = saturate_cast<uchar>(border_value[k & 3]);
+
+    // divide into blocks; NOTE(review): this scratch buffer is 4 MiB of stack - confirm the callers' stack size allows it
+    const int BLOCK_SIZE = 1024;
+    int x, y, x1, y1;
+    std::array<short, BLOCK_SIZE * BLOCK_SIZE * 2> aXY;
+    short* XY = aXY.data();
+    size_t XY_step = BLOCK_SIZE * 2; // two shorts (x, y) per destination pixel
+
+    // vectorize: {x, y} multiplied lane-wise by {cn, src_step} gives the byte offset of a source pixel
+    const int32x2_t src_wh = {src_width, src_height};
+    const int32x2_t arr_index = {cn, (int)src_step};
+
+    for (y = 0; y < dst_height; y += BLOCK_SIZE)
+    {
+        int dy = std::min(BLOCK_SIZE, dst_height - y);
+        for (x = 0; x < dst_width; x += BLOCK_SIZE)
+        {
+            const int off_y = isRelative ? y : 0;
+            const int off_x = isRelative ? x : 0;
+            const int32x2_t voff = {off_x, off_y}; // block origin, added per pixel in the processing loops
+
+            int dx = std::min(BLOCK_SIZE, dst_width - x);
+            // prepare mapping data XY: rounded, 16-bit saturated source coordinates
+            for (y1 = 0; y1 < dy; y1++)
+            {
+                short* rXY = XY + y1 * XY_step;
+                const float* sX = mapx + (y + y1) * mapx_step + x;
+                const float* sY = mapy + (y + y1) * mapy_step + x;
+                for (x1 = 0; x1 < dx; x1++)
+                {
+                    rXY[x1 * 2] = saturate_cast<short>(sX[x1]);
+                    rXY[x1 * 2 + 1] = saturate_cast<short>(sY[x1]);
+                }
+            }
+
+            // precalculate offset: add the in-block position (x1, y1) to every prepared entry
+            if(isRelative)
+            {
+                int16x8_t voff_x;
+                int16x8_t voff_y = {0, 0, 1, 0, 2, 0, 3, 0}; // (x, y) pairs for four consecutive pixels of row 0
+                int16x8_t vones_x = {4, 0, 4, 0, 4, 0, 4, 0}; // step four pixels to the right
+                int16x8_t vones_y = {0, 1, 0, 1, 0, 1, 0, 1}; // step one row down
+                for(y1 = 0; y1 < dy; y1++, voff_y += vones_y) // only the dy x dx region filled above is used
+                {
+                    int16x8_t* vrXY = (int16x8_t*)(XY + y1 * XY_step);
+                    for(x1 = 0, voff_x = voff_y; x1 < dx; x1 += 4, vrXY++, voff_x += vones_x) // rounds up to 4 pixels, still inside the row
+                    {
+                        *vrXY += voff_x;
+                    }
+                }
+            }
+
+            // process the block
+            for( y1 = 0; y1 < dy; y1++ )
+            {
+                uchar* dst_row = dst_data + (y + y1) * dst_step + x * cn;
+                const short* rXY = XY + y1 * XY_step;
+                if( cn == 1 )
+                {
+                    for( x1 = 0; x1 < dx; x1++ )
+                    {
+                        int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
+                        if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 ) // both lanes inside the source image
+                            dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
+                        else
+                        {
+                            if( border_type == CV_HAL_BORDER_REPLICATE )
+                            {
+                                vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
+                                dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
+                            }
+                            else if( border_type == CV_HAL_BORDER_CONSTANT )
+                                dst_row[x1] = border_const[0];
+                            else if( border_type != CV_HAL_BORDER_TRANSPARENT ) // TRANSPARENT leaves the destination pixel untouched
+                            {
+                                vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
+                                vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
+                                dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    uchar* dst_ptr = dst_row;
+                    for(x1 = 0; x1 < dx; x1++, dst_ptr += cn )
+                    {
+                        int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
+                        const uchar *src_ptr;
+                        if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 )
+                        {
+                            if( cn == 3 )
+                            {
+                                src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                                dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[2];
+                                // performance loss, commented out
+                                // *(unsigned*)dst_ptr = __nds__bpick(*(unsigned*)dst_ptr, *(unsigned*)src_ptr, 0xFF000000);
+                            }
+                            else if( cn == 4 )
+                            {
+                                src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                                *(uint8x4_t*)dst_ptr = *(uint8x4_t*)src_ptr;
+                            }
+                            else
+                            {
+                                src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                                int k = 0;
+                                for(; k <= cn - 8; k += 8) // indexed copy: dst_ptr itself is advanced by cn in the x1 loop only
+                                    *(uint8x8_t*)(dst_ptr + k) = *(const uint8x8_t*)(src_ptr + k);
+                                for(; k < cn; k++)
+                                    dst_ptr[k] = src_ptr[k];
+                            }
+                        }
+                        else if( border_type != CV_HAL_BORDER_TRANSPARENT )
+                        {
+                            if( border_type == CV_HAL_BORDER_REPLICATE )
+                            {
+                                vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
+                                src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                            }
+                            else if( border_type == CV_HAL_BORDER_CONSTANT )
+                                src_ptr = &border_const[0];
+                            else
+                            {
+                                vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
+                                vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
+                                src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
+                            }
+                            int k = 0;
+                            for(; k <= cn - 8; k += 8) // same indexed copy as above, so dst_ptr is not advanced twice
+                                *(uint8x8_t*)(dst_ptr + k) = *(const uint8x8_t*)(src_ptr + k);
+                            for(; k < cn; k++)
+                                dst_ptr[k] = src_ptr[k];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/threshold.cpp b/3rdparty/ndsrvp/src/threshold.cpp
index 06de591fef..0812100311 100644
--- a/3rdparty/ndsrvp/src/threshold.cpp
+++ b/3rdparty/ndsrvp/src/threshold.cpp
@@ -4,65 +4,44 @@
 
 #include "ndsrvp_hal.hpp"
 #include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
 
 namespace cv {
 
 namespace ndsrvp {
 
 template <typename type, typename vtype>
-class operators_threshold_t {
-public:
-    virtual ~operators_threshold_t() {};
-    virtual inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
-    {
-        (void)src;
-        (void)thresh;
-        (void)maxval;
-        CV_Error(cv::Error::StsBadArg, "");
-        return vtype();
-    }
-    virtual inline type scalar(const type& src, const type& thresh, const type& maxval)
-    {
-        (void)src;
-        (void)thresh;
-        (void)maxval;
-        CV_Error(cv::Error::StsBadArg, "");
-        return type();
-    }
-};
-
-template <typename type, typename vtype>
-class opThreshBinary : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshBinary_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
     {
         return (vtype)__nds__bpick((long)maxval, (long)0, (long)(src > thresh));
     }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
     {
         return src > thresh ? maxval : 0;
     }
 };
 
 template <typename type, typename vtype>
-class opThreshBinaryInv : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshBinaryInv_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
     {
         return (vtype)__nds__bpick((long)0, (long)maxval, (long)(src > thresh));
     }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
     {
         return src > thresh ? 0 : maxval;
     }
 };
 
 template <typename type, typename vtype>
-class opThreshTrunc : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshTrunc_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
     {
         (void)maxval;
         return (vtype)__nds__bpick((long)thresh, (long)src, (long)(src > thresh));
     }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
     {
         (void)maxval;
         return src > thresh ? thresh : src;
@@ -70,13 +49,13 @@ class opThreshTrunc : public operators_threshold_t<type, vtype> {
 };
 
 template <typename type, typename vtype>
-class opThreshToZero : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshToZero_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
     {
         (void)maxval;
         return (vtype)__nds__bpick((long)src, (long)0, (long)(src > thresh));
     }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
     {
         (void)maxval;
         return src > thresh ? src : 0;
@@ -84,29 +63,36 @@ class opThreshToZero : public operators_threshold_t<type, vtype> {
 };
 
 template <typename type, typename vtype>
-class opThreshToZeroInv : public operators_threshold_t<type, vtype> {
-    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+struct opThreshToZeroInv_t {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
     {
         (void)maxval;
         return (vtype)__nds__bpick((long)0, (long)src, (long)(src > thresh));
     }
-    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    inline type scalar(const type& src, const type& thresh, const type& maxval)
     {
         (void)maxval;
         return src > thresh ? 0 : src;
     }
 };
 
-template <typename type, typename vtype, int nlane>
-static void threshold_op(const type* src_data, size_t src_step,
-    type* dst_data, size_t dst_step,
+template <typename type, typename vtype, int nlane,
+    template <typename ttype, typename vttype> class opThresh_t> // `class` here: `typename` is only accepted from C++17 on
+static inline void threshold_op(const uchar* src, size_t src_step, // steps arrive in bytes and are converted to elements below
+    uchar* dst, size_t dst_step,
     int width, int height, int cn,
-    type thresh, type maxval, int thtype)
+    double thresh_d, double maxval_d)
 {
     int i, j;
     width *= cn;
+
+    const type* src_data = (const type*)src; // reinterpret the raw byte rows as rows of `type`
+    type* dst_data = (type*)dst;
     src_step /= sizeof(type);
     dst_step /= sizeof(type);
+
+    type thresh = saturate_cast<type>(thresh_d); // saturate once; the scalar and vector paths share these values
+    type maxval = saturate_cast<type>(maxval_d);
     vtype vthresh;
     vtype vmaxval;
     for (i = 0; i < nlane; i++) {
@@ -114,62 +100,63 @@ static void threshold_op(const type* src_data, size_t src_step,
         vmaxval[i] = maxval;
     }
 
-    operators_threshold_t<type, vtype>* op;
-    switch (thtype) {
-    case CV_HAL_THRESH_BINARY:
-        op = new opThreshBinary<type, vtype>();
-        break;
-    case CV_HAL_THRESH_BINARY_INV:
-        op = new opThreshBinaryInv<type, vtype>();
-        break;
-    case CV_HAL_THRESH_TRUNC:
-        op = new opThreshTrunc<type, vtype>();
-        break;
-    case CV_HAL_THRESH_TOZERO:
-        op = new opThreshToZero<type, vtype>();
-        break;
-    case CV_HAL_THRESH_TOZERO_INV:
-        op = new opThreshToZeroInv<type, vtype>();
-        break;
-    default:
-        CV_Error(cv::Error::StsBadArg, "");
-        return;
-    }
+    opThresh_t<type, vtype> opThresh; // operator chosen at compile time, no virtual dispatch or heap allocation
 
     for (i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) {
         for (j = 0; j <= width - nlane; j += nlane) {
-            vtype vs = *(vtype*)(src_data + j);
-            *(vtype*)(dst_data + j) = op->vector(vs, vthresh, vmaxval);
+            *(vtype*)(dst_data + j) = opThresh.vector(*(const vtype*)(src_data + j), vthresh, vmaxval); // nlane elements per step
         }
         for (; j < width; j++) {
-            dst_data[j] = op->scalar(src_data[j], thresh, maxval);
+            dst_data[j] = opThresh.scalar(src_data[j], thresh, maxval); // leftover elements of the row
         }
     }
 
-    delete op;
     return;
 }
 
+typedef void (*ThreshFunc)(const uchar* src_data, size_t src_step, // common signature of every threshold_op instantiation
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int cn,
+    double thresh, double maxval);
+
 int threshold(const uchar* src_data, size_t src_step,
     uchar* dst_data, size_t dst_step,
     int width, int height, int depth, int cn,
     double thresh, double maxValue, int thresholdType)
 {
-    if (width <= 255 && height <= 255) // slower at small size
-        return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    if (depth == CV_8U) {
-        threshold_op<uchar, uint8x8_t, 8>((uchar*)src_data, src_step, (uchar*)dst_data, dst_step, width, height, cn, (uchar)thresh, (uchar)maxValue, thresholdType);
-        return CV_HAL_ERROR_OK;
-    } else if (depth == CV_16S) {
-        threshold_op<short, int16x4_t, 4>((short*)src_data, src_step, (short*)dst_data, dst_step, width, height, cn, (short)thresh, (short)maxValue, thresholdType);
-        return CV_HAL_ERROR_OK;
-    } else if (depth == CV_16U) {
-        threshold_op<ushort, uint16x4_t, 4>((ushort*)src_data, src_step, (ushort*)dst_data, dst_step, width, height, cn, (ushort)thresh, (ushort)maxValue, thresholdType);
-        return CV_HAL_ERROR_OK;
-    } else {
+    static const ThreshFunc thfuncs[4][5] = // indexed [depth][thresholdType]; never modified, hence const
+    {
+        { // depth 0: unsigned 8-bit (CV_8U)
+            threshold_op<uchar, uint8x8_t, 8, opThreshBinary_t>,
+            threshold_op<uchar, uint8x8_t, 8, opThreshBinaryInv_t>,
+            threshold_op<uchar, uint8x8_t, 8, opThreshTrunc_t>,
+            threshold_op<uchar, uint8x8_t, 8, opThreshToZero_t>,
+            threshold_op<uchar, uint8x8_t, 8, opThreshToZeroInv_t> },
+        { // depth 1: signed 8-bit; NOTE(review): plain char may be unsigned on some ABIs - confirm against the int8x8_t lanes
+            threshold_op<char, int8x8_t, 8, opThreshBinary_t>,
+            threshold_op<char, int8x8_t, 8, opThreshBinaryInv_t>,
+            threshold_op<char, int8x8_t, 8, opThreshTrunc_t>,
+            threshold_op<char, int8x8_t, 8, opThreshToZero_t>,
+            threshold_op<char, int8x8_t, 8, opThreshToZeroInv_t> },
+        { // depth 2: unsigned 16-bit
+            threshold_op<ushort, uint16x4_t, 4, opThreshBinary_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshBinaryInv_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshTrunc_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshToZero_t>,
+            threshold_op<ushort, uint16x4_t, 4, opThreshToZeroInv_t> },
+        { // depth 3: signed 16-bit
+            threshold_op<short, int16x4_t, 4, opThreshBinary_t>,
+            threshold_op<short, int16x4_t, 4, opThreshBinaryInv_t>,
+            threshold_op<short, int16x4_t, 4, opThreshTrunc_t>,
+            threshold_op<short, int16x4_t, 4, opThreshToZero_t>,
+            threshold_op<short, int16x4_t, 4, opThreshToZeroInv_t> }
+    };
+
+    if(depth < 0 || depth > 3 || thresholdType < 0 || thresholdType > 4 || (width < 256 && height < 256)) // outside the table, or image too small to be worth it
         return CV_HAL_ERROR_NOT_IMPLEMENTED;
-    }
-    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    thfuncs[depth][thresholdType](src_data, src_step, dst_data, dst_step, width, height, cn, thresh, maxValue);
+    return CV_HAL_ERROR_OK;
 }
 
 } // namespace ndsrvp
diff --git a/3rdparty/ndsrvp/src/warpAffine.cpp b/3rdparty/ndsrvp/src/warpAffine.cpp
index d54e4dc237..4257361d1d 100644
--- a/3rdparty/ndsrvp/src/warpAffine.cpp
+++ b/3rdparty/ndsrvp/src/warpAffine.cpp
@@ -3,148 +3,68 @@
 // of this distribution and at http://opencv.org/license.html.	
 
 #include "ndsrvp_hal.hpp"
-#include "opencv2/core.hpp"
 #include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
 
 namespace cv {
 
 namespace ndsrvp {
 
-class WarpAffineInvoker : public ParallelLoopBody {
-public:
-    WarpAffineInvoker(const Mat& _src, Mat& _dst, int _interpolation, int _borderType,
-        const Scalar& _borderValue, int* _adelta, int* _bdelta, const double* _M)
-        : ParallelLoopBody()
-        , src(_src)
-        , dst(_dst)
-        , interpolation(_interpolation)
-        , borderType(_borderType)
-        , borderValue(_borderValue)
-        , adelta(_adelta)
-        , bdelta(_bdelta)
-        , M(_M)
-    {
+int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw) // fills xy with bw interleaved (x, y) integer source coordinates
+{
+    const int AB_BITS = MAX(10, (int)INTER_BITS); // number of fractional bits carried by X0/Y0 and the deltas
+    int x1 = 0;
+
+    for (; x1 <= bw - 2; x1 += 2) { // pairs only: an odd trailing element is left to the scalar loop, so nothing past bw is touched
+        int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
+        int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
+
+        vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
+        vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
     }
 
-    virtual void operator()(const Range& range) const CV_OVERRIDE
-    {
-        const int BLOCK_SZ = 64;
-        AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
-        short *XY = __XY.data(), *A = __A.data();
-        const int AB_BITS = MAX(10, (int)INTER_BITS);
-        const int AB_SCALE = 1 << AB_BITS;
-        int round_delta = interpolation == CV_HAL_INTER_NEAREST ? AB_SCALE / 2 : AB_SCALE / INTER_TAB_SIZE / 2, x, y, x1, y1;
-
-        int bh0 = std::min(BLOCK_SZ / 2, dst.rows);
-        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, dst.cols);
-        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, dst.rows);
-
-        for (y = range.start; y < range.end; y += bh0) {
-            for (x = 0; x < dst.cols; x += bw0) {
-                int bw = std::min(bw0, dst.cols - x);
-                int bh = std::min(bh0, range.end - y);
-
-                Mat _XY(bh, bw, CV_16SC2, XY);
-                Mat dpart(dst, Rect(x, y, bw, bh));
-
-                for (y1 = 0; y1 < bh; y1++) {
-                    short* xy = XY + y1 * bw * 2;
-                    int X0 = saturate_cast<int>((M[1] * (y + y1) + M[2]) * AB_SCALE) + round_delta;
-                    int Y0 = saturate_cast<int>((M[4] * (y + y1) + M[5]) * AB_SCALE) + round_delta;
-
-                    if (interpolation == CV_HAL_INTER_NEAREST) {
-                        x1 = 0;
-
-                        for (; x1 < bw; x1 += 2) {
-                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
-                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
-
-                            vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
-                            vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            int X = (X0 + adelta[x + x1]) >> AB_BITS;
-                            int Y = (Y0 + bdelta[x + x1]) >> AB_BITS;
-                            xy[x1 * 2] = saturate_cast<short>(X);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
-                        }
-                    } else {
-                        short* alpha = A + y1 * bw;
-                        x1 = 0;
-
-                        const int INTER_MASK = INTER_TAB_SIZE - 1;
-                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
-                        for (; x1 < bw; x1 += 2) {
-                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
-                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
-                            vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
-                            vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
-
-                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
-                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
-
-                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
-                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            int X = (X0 + adelta[x + x1]) >> (AB_BITS - INTER_BITS);
-                            int Y = (Y0 + bdelta[x + x1]) >> (AB_BITS - INTER_BITS);
-                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
-                        }
-                    }
-                }
-
-                if (interpolation == CV_HAL_INTER_NEAREST)
-                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
-                else {
-                    Mat _matA(bh, bw, CV_16U, A);
-                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
-                }
-            }
-        }
+    for (; x1 < bw; x1++) { // scalar tail: at most one element when bw is odd
+        int X = (X0 + adelta[x1]) >> AB_BITS; // same fixed-point to integer shift as the vector path
+        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
+        xy[x1 * 2] = saturate_cast<short>(X);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
     }
 
-private:
-    Mat src;
-    Mat dst;
-    int interpolation, borderType;
-    Scalar borderValue;
-    int *adelta, *bdelta;
-    const double* M;
-};
-
-int warpAffine(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[6], int interpolation, int borderType, const double borderValue[4])
-{
-    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
-    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+    return CV_HAL_ERROR_OK;
+}
 
-    int x;
-    AutoBuffer<int> _abdelta(dst.cols * 2);
-    int *adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
+int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) // fills xy with integer (x, y) pairs and alpha with the packed fractional index
+{
     const int AB_BITS = MAX(10, (int)INTER_BITS);
-    const int AB_SCALE = 1 << AB_BITS;
+    int x1 = 0;
+
+    const int INTER_MASK = INTER_TAB_SIZE - 1; // isolates the fractional bits kept for interpolation
+    const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+    for (; x1 <= bw - 2; x1 += 2) { // pairs only: an odd trailing element is left to the scalar loop, so nothing past bw is touched
+        int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
+        int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
+        vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS)); // keep INTER_BITS fractional bits
+        vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
+
+        int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
+        int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
+
+        uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+        *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
+    }
 
-    for (x = 0; x < dst.cols; x++) {
-        adelta[x] = saturate_cast<int>(M[0] * x * AB_SCALE);
-        bdelta[x] = saturate_cast<int>(M[3] * x * AB_SCALE);
+    for (; x1 < bw; x1++) { // scalar tail: at most one element when bw is odd
+        int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS); // same pre-shift as the vector path
+        int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
+        xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+        alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK));
     }
 
-    Range range(0, dst.rows);
-    WarpAffineInvoker invoker(src, dst, interpolation, borderType,
-        Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
-        adelta, bdelta, M);
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
     return CV_HAL_ERROR_OK;
 }
 
diff --git a/3rdparty/ndsrvp/src/warpPerspective.cpp b/3rdparty/ndsrvp/src/warpPerspective.cpp
index b4fa423ed7..40e44729d9 100644
--- a/3rdparty/ndsrvp/src/warpPerspective.cpp
+++ b/3rdparty/ndsrvp/src/warpPerspective.cpp
@@ -3,154 +3,90 @@
 // of this distribution and at http://opencv.org/license.html.	
 
 #include "ndsrvp_hal.hpp"
-#include "opencv2/core.hpp"
 #include "opencv2/imgproc/hal/interface.h"
+#include "cvutils.hpp"
 
 namespace cv {
 
 namespace ndsrvp {
 
-class WarpPerspectiveInvoker : public ParallelLoopBody {
-public:
-    WarpPerspectiveInvoker(const Mat& _src, Mat& _dst, const double* _M, int _interpolation,
-        int _borderType, const Scalar& _borderValue)
-        : ParallelLoopBody()
-        , src(_src)
-        , dst(_dst)
-        , M(_M)
-        , interpolation(_interpolation)
-        , borderType(_borderType)
-        , borderValue(_borderValue)
-    {
+int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
+{ // Nearest-neighbour perspective mapping for one block row: writes bw interleaved (x, y) short pairs to xy.
+    int x1 = 0;
+
+    for (; x1 + 1 < bw; x1 += 2) { // two pixels per pass; a lone last pixel (odd bw) is left to the scalar loop so nothing is stored past the row
+        double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+        W1 = W1 ? 1. / W1 : 0; // a zero denominator yields 0 instead of dividing by zero
+        W2 = W2 ? 1. / W2 : 0;
+        double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+        double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+        double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+        double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+
+        int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+        int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+
+        vX = __nds__v_sclip32(vX, 15); // presumably saturates each lane to the int16 range, like saturate_cast<short> below - confirm
+        vY = __nds__v_sclip32(vY, 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX); // one store covers xy[2*x1 .. 2*x1+3]
     }
 
-    virtual void operator()(const Range& range) const CV_OVERRIDE
-    {
-        const int BLOCK_SZ = 32;
-        short XY[BLOCK_SZ * BLOCK_SZ * 2], A[BLOCK_SZ * BLOCK_SZ];
-        int x, y, y1, width = dst.cols, height = dst.rows;
-
-        int bh0 = std::min(BLOCK_SZ / 2, height);
-        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, width);
-        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, height);
-
-        for (y = range.start; y < range.end; y += bh0) {
-            for (x = 0; x < width; x += bw0) {
-                int bw = std::min(bw0, width - x);
-                int bh = std::min(bh0, range.end - y); // height
-
-                Mat _XY(bh, bw, CV_16SC2, XY);
-                Mat dpart(dst, Rect(x, y, bw, bh));
-
-                for (y1 = 0; y1 < bh; y1++) {
-                    short* xy = XY + y1 * bw * 2;
-                    double X0 = M[0] * x + M[1] * (y + y1) + M[2];
-                    double Y0 = M[3] * x + M[4] * (y + y1) + M[5];
-                    double W0 = M[6] * x + M[7] * (y + y1) + M[8];
-
-                    if (interpolation == CV_HAL_INTER_NEAREST) {
-                        int x1 = 0;
-
-                        for (; x1 < bw; x1 += 2) {
-                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
-                            W1 = W1 ? 1. / W1 : 0;
-                            W2 = W2 ? 1. / W2 : 0;
-                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
-                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
-                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
-                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
-
-                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
-                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
-
-                            vX = __nds__v_sclip32(vX, 15);
-                            vY = __nds__v_sclip32(vY, 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            double W = W0 + M[6] * x1;
-                            W = W ? 1. / W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1 * 2] = saturate_cast<short>(X);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
-                        }
-                    } else {
-                        short* alpha = A + y1 * bw;
-                        int x1 = 0;
-
-                        const int INTER_MASK = INTER_TAB_SIZE - 1;
-                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
-                        for (; x1 < bw; x1 += 2) {
-                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
-                            W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
-                            W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
-                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
-                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
-                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
-                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
-
-                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
-                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
-
-                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
-                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
-
-                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
-
-                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
-                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
-                        }
-
-                        for (; x1 < bw; x1++) {
-                            double W = W0 + M[6] * x1;
-                            W = W ? INTER_TAB_SIZE / W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
-                        }
-                    }
-                }
-
-                if (interpolation == CV_HAL_INTER_NEAREST)
-                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
-                else {
-                    Mat _matA(bh, bw, CV_16U, A);
-                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
-                }
-            }
-        }
+    for (; x1 < bw; x1++) { // scalar tail: handles the final pixel when bw is odd
+        double W = W0 + M[6] * x1;
+        W = W ? 1. / W : 0;
+        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+        int X = saturate_cast<int>(fX);
+        int Y = saturate_cast<int>(fY);
+
+        xy[x1 * 2] = saturate_cast<short>(X);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
     }
 
-private:
-    Mat src;
-    Mat dst;
-    const double* M;
-    int interpolation, borderType;
-    Scalar borderValue;
-};
-
-int warpPerspective(int src_type,
-    const uchar* src_data, size_t src_step, int src_width, int src_height,
-    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
-    const double M[9], int interpolation, int borderType, const double borderValue[4])
+    return CV_HAL_ERROR_OK; // always reports success: every bw is handled here
+}
+
+int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) // one block row: integer (x, y) pairs to xy, interpolation-table index to alpha
 {
-    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
-    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+    int x1 = 0;
+
+    const int INTER_MASK = INTER_TAB_SIZE - 1; // selects the fractional bits of a coordinate scaled by INTER_TAB_SIZE
+    const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+    for (; x1 + 1 < bw; x1 += 2) { // two pixels per pass; a lone last pixel (odd bw) is left to the scalar loop so xy/alpha are never written past the row
+        double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+        W1 = W1 ? INTER_TAB_SIZE / W1 : 0; // a zero denominator yields 0 instead of dividing by zero
+        W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
+        double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+        double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+        double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+        double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+
+        int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+        int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+
+        int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15); // integer part; presumably saturated to the int16 range - confirm
+        int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+
+        *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx); // one store covers xy[2*x1 .. 2*x1+3]
+
+        uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+        *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) }; // writes alpha[x1] and alpha[x1 + 1]
+    }
+
+    for (; x1 < bw; x1++) { // scalar tail: handles the final pixel when bw is odd
+        double W = W0 + M[6] * x1;
+        W = W ? INTER_TAB_SIZE / W : 0;
+        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+        int X = saturate_cast<int>(fX);
+        int Y = saturate_cast<int>(fY);
+
+        xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+        alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK)); // fractional y selects the table row, fractional x the column
+    }
 
-    Range range(0, dst.rows);
-    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
     return CV_HAL_ERROR_OK;
 }
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0693731a8b..29d05cd86b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1040,7 +1040,7 @@ foreach(hal ${OpenCV_HAL})
       ocv_hal_register(NDSRVP_HAL_LIBRARIES NDSRVP_HAL_HEADERS NDSRVP_HAL_INCLUDE_DIRS)
       list(APPEND OpenCV_USED_HAL "ndsrvp (ver ${NDSRVP_HAL_VERSION})")
     else()
-      message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not open, disabling ndsrvp...")
+      message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...")
     endif()
   elseif(hal STREQUAL "halrvv")
     if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")
diff --git a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
index 48851ece07..d4b0f3fbb9 100644
--- a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
+++ b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
@@ -108,11 +108,19 @@ CV_EXPORTS void warpAffine(int src_type,
                            uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
                            const double M[6], int interpolation, int borderType, const double borderValue[4]);
 
+CV_EXPORTS void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
+
+CV_EXPORTS void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
+
 CV_EXPORTS void warpPerspective(int src_type,
                                const uchar * src_data, size_t src_step, int src_width, int src_height,
                                uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
                                const double M[9], int interpolation, int borderType, const double borderValue[4]);
 
+CV_EXPORTS void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);
+
+CV_EXPORTS void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
+
 CV_EXPORTS void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
                             uchar * dst_data, size_t dst_step,
                             int width, int height,
diff --git a/modules/imgproc/include/opencv2/imgproc/hal/interface.h b/modules/imgproc/include/opencv2/imgproc/hal/interface.h
index f8dbcfe791..8e485b9fca 100644
--- a/modules/imgproc/include/opencv2/imgproc/hal/interface.h
+++ b/modules/imgproc/include/opencv2/imgproc/hal/interface.h
@@ -12,6 +12,12 @@
 #define CV_HAL_INTER_CUBIC 2
 #define CV_HAL_INTER_AREA 3
 #define CV_HAL_INTER_LANCZOS4 4
+#define CV_HAL_INTER_LINEAR_EXACT 5
+#define CV_HAL_INTER_NEAREST_EXACT 6
+#define CV_HAL_INTER_MAX 7 // NOTE(review): presumably an upper bound / mask for the interpolation codes above - confirm
+#define CV_HAL_WARP_FILL_OUTLIERS 8 // NOTE(review): 8/16/32 look like OR-able flag bits rather than interpolation codes - confirm
+#define CV_HAL_WARP_INVERSE_MAP 16
+#define CV_HAL_WARP_RELATIVE_MAP 32
 //! @}
 
 //! @name Morphology operations
diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
index 773fed9b48..ceb6c8b0f6 100644
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -273,6 +273,29 @@ inline int hal_ni_resize(int src_type, const uchar *src_data, size_t src_step, i
    @sa cv::warpAffine, cv::hal::warpAffine
  */
 inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpAffineBlocklineNN computes the integer source coordinates for one row of a block in a nearest-neighbor affine warp
+   @param adelta input array of per-column M0 * x terms, as fixed-point integers with AB_BITS fractional bits
+   @param bdelta input array of per-column M3 * x terms, as fixed-point integers with AB_BITS fractional bits
+   @param xy output interleaved (x', y') coordinates: xy[2*i] is x', xy[2*i+1] is y', saturated to short
+   @param X0 input M1 * y + M2 value in the same fixed-point format (the caller also folds in the rounding offset)
+   @param Y0 input M4 * y + M5 value in the same fixed-point format (the caller also folds in the rounding offset)
+   @param bw length of the row, i.e. the number of (x', y') pairs to produce
+   @sa cv::warpAffine, cv::hal::warpAffineBlocklineNN
+ */
+inline int hal_ni_warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpAffineBlockline computes source coordinates and interpolation-table indices for one row of a block in an affine warp
+   @param adelta input array of per-column M0 * x terms, as fixed-point integers with AB_BITS fractional bits
+   @param bdelta input array of per-column M3 * x terms, as fixed-point integers with AB_BITS fractional bits
+   @param xy output interleaved integer parts of (x', y'): xy[2*i] is x', xy[2*i+1] is y', saturated to short
+   @param alpha output per-pixel index built from the fractional bits: (fy * INTER_TAB_SIZE + fx), used for interpolation
+   @param X0 input M1 * y + M2 value in the same fixed-point format (the caller also folds in the rounding offset)
+   @param Y0 input M4 * y + M5 value in the same fixed-point format (the caller also folds in the rounding offset)
+   @param bw length of the row, i.e. the number of pixels to produce
+   @sa cv::warpAffine, cv::hal::warpAffineBlockline
+ */
+inline int hal_ni_warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 /**
    @brief hal_warpPerspective
    @param src_type source and destination image type
@@ -291,11 +314,38 @@ inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_ste
    @sa cv::warpPerspective, cv::hal::warpPerspective
  */
 inline int hal_ni_warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpPerspectiveBlocklineNN computes the integer source coordinates for one row of a block in a nearest-neighbor perspective warp
+   @param M 3x3 matrix with transform coefficients (9 doubles, row by row; M0, M3 and M6 are the per-column increments)
+   @param xy output interleaved (x', y') coordinates: xy[2*i] is x', xy[2*i+1] is y', saturated to short
+   @param X0 input M0 * x0 + M1 * y + M2 value, where x0 is the first column of the row
+   @param Y0 input M3 * x0 + M4 * y + M5 value
+   @param W0 input M6 * x0 + M7 * y + M8 value (the projective denominator at x0)
+   @param bw length of the row, i.e. the number of (x', y') pairs to produce
+   @sa cv::warpPerspective, cv::hal::warpPerspectiveBlocklineNN
+ */
+inline int hal_ni_warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpPerspectiveBlockline computes source coordinates and interpolation-table indices for one row of a block in a perspective warp
+   @param M 3x3 matrix with transform coefficients (9 doubles, row by row; M0, M3 and M6 are the per-column increments)
+   @param xy output interleaved integer parts of (x', y'): xy[2*i] is x', xy[2*i+1] is y', saturated to short
+   @param alpha output per-pixel index built from the fractional bits: (fy * INTER_TAB_SIZE + fx), used for interpolation
+   @param X0 input M0 * x0 + M1 * y + M2 value, where x0 is the first column of the row
+   @param Y0 input M3 * x0 + M4 * y + M5 value
+   @param W0 input M6 * x0 + M7 * y + M8 value (the projective denominator at x0)
+   @param bw length of the row, i.e. the number of pixels to produce
+   @sa cv::warpPerspective, cv::hal::warpPerspectiveBlockline
+ */
+inline int hal_ni_warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
 //! @cond IGNORED
 #define cv_hal_resize hal_ni_resize
 #define cv_hal_warpAffine hal_ni_warpAffine
+#define cv_hal_warpAffineBlocklineNN hal_ni_warpAffineBlocklineNN
+#define cv_hal_warpAffineBlockline hal_ni_warpAffineBlockline
 #define cv_hal_warpPerspective hal_ni_warpPerspective
+#define cv_hal_warpPerspectiveBlocklineNN hal_ni_warpPerspectiveBlocklineNN
+#define cv_hal_warpPerspectiveBlockline hal_ni_warpPerspectiveBlockline
 //! @endcond
 
 /**
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index c0eaf8114c..4e4d718da3 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -2268,16 +2268,7 @@ public:
         short *XY = __XY.data(), *A = __A.data();
         const int AB_BITS = MAX(10, (int)INTER_BITS);
         const int AB_SCALE = 1 << AB_BITS;
-        int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
-    #if CV_TRY_AVX2
-        bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
-    #endif
-    #if CV_TRY_SSE4_1
-        bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
-    #endif
-    #if CV_TRY_LASX
-        bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
-    #endif
+        int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, y1;
 
         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@@ -2300,84 +2291,9 @@ public:
                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
 
                     if( interpolation == INTER_NEAREST )
-                    {
-                        x1 = 0;
-                        #if CV_TRY_SSE4_1
-                        if( useSSE4_1 )
-                            opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
-                        else
-                        #endif
-                        {
-                            #if CV_SIMD128
-                            {
-                                v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
-                                int span = VTraits<v_uint16x8>::vlanes();
-                                for( ; x1 <= bw - span; x1 += span )
-                                {
-                                    v_int16x8 v_dst[2];
-                                    #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
-                                                                                    v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
-                                    v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
-                                    v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
-                                    #undef CV_CONVERT_MAP
-                                    v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
-                                }
-                            }
-                            #endif
-                            for( ; x1 < bw; x1++ )
-                            {
-                                int X = (X0 + adelta[x+x1]) >> AB_BITS;
-                                int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
-                                xy[x1*2] = saturate_cast<short>(X);
-                                xy[x1*2+1] = saturate_cast<short>(Y);
-                            }
-                        }
-                    }
+                        hal::warpAffineBlocklineNN(adelta + x, bdelta + x, xy, X0, Y0, bw);
                     else
-                    {
-                        short* alpha = A + y1*bw;
-                        x1 = 0;
-                        #if CV_TRY_AVX2
-                        if ( useAVX2 )
-                            x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
-                        #endif
-                        #if CV_TRY_LASX
-                        if ( useLASX )
-                            x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
-                        #endif
-                        #if CV_SIMD128
-                        {
-                            v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
-                            v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
-                            int span = VTraits<v_float32x4>::vlanes();
-                            for( ; x1 <= bw - span * 2; x1 += span * 2 )
-                            {
-                                v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1)));
-                                v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1)));
-                                v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1 + span)));
-                                v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1 + span)));
-
-                                v_int16x8 v_xy[2];
-                                v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
-                                v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
-                                v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
-
-                                v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
-                                v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
-                                v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
-                            }
-                        }
-                        #endif
-                        for( ; x1 < bw; x1++ )
-                        {
-                            int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
-                            int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
-                            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
-                                    (X & (INTER_TAB_SIZE-1)));
-                        }
-                    }
+                        hal::warpAffineBlockline(adelta + x, bdelta + x, xy, A + y1*bw, X0, Y0, bw);
                 }
 
                 if( interpolation == INTER_NEAREST )
@@ -2802,6 +2718,97 @@ void warpAffine(int src_type,
     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 }
 
+void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
+{ // Nearest-neighbor affine mapping for one block row: xy gets bw interleaved (x, y) pairs from the fixed-point sums X0 + adelta[i], Y0 + bdelta[i].
+    CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw); // presumably returns early when a custom HAL handles the row - confirm against CALL_HAL
+
+    const int AB_BITS = MAX(10, (int)INTER_BITS); // number of fractional bits carried by X0/Y0/adelta/bdelta
+    int x1 = 0;
+    #if CV_TRY_SSE4_1
+    bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
+    if( useSSE4_1 )
+        opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
+    else
+    #endif // CV_TRY_SSE4_1
+    {
+        #if CV_SIMD128
+        {
+            v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
+            int span = VTraits<v_uint16x8>::vlanes();
+            for( ; x1 <= bw - span; x1 += span ) // full vectors only; the remainder falls through to the scalar loop
+            {
+                v_int16x8 v_dst[2];
+                #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
+                                                                v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
+                v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
+                v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
+                #undef CV_CONVERT_MAP
+                v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]); // interleave x and y into (x, y) pairs
+            }
+        }
+        #endif // CV_SIMD128
+        for( ; x1 < bw; x1++ ) // scalar tail (the whole row when CV_SIMD128 is not available)
+        {
+            int X = (X0 + adelta[x1]) >> AB_BITS; // drop the fractional bits
+            int Y = (Y0 + bdelta[x1]) >> AB_BITS;
+            xy[x1*2] = saturate_cast<short>(X);
+            xy[x1*2+1] = saturate_cast<short>(Y);
+        }
+    }
+}
+
+void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
+{ // Affine mapping for one block row with interpolation: xy gets the integer (x, y) pairs, alpha the index built from the fractional bits.
+    CALL_HAL(warpAffineBlockline, cv_hal_warpAffineBlockline, adelta, bdelta, xy, alpha, X0, Y0, bw); // presumably returns early when a custom HAL handles the row - confirm against CALL_HAL
+
+    const int AB_BITS = MAX(10, (int)INTER_BITS); // number of fractional bits carried by X0/Y0/adelta/bdelta
+    int x1 = 0;
+    #if CV_TRY_AVX2
+    bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
+    if ( useAVX2 )
+        x1 = opt_AVX2::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw); // result becomes the start column for the loops below
+    #endif // CV_TRY_AVX2
+    #if CV_TRY_LASX
+    bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
+    if ( useLASX )
+        x1 = opt_LASX::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw); // result becomes the start column for the loops below
+    #endif // CV_TRY_LASX
+    {
+        #if CV_SIMD128
+        {
+            v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
+            v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
+            int span = VTraits<v_float32x4>::vlanes();
+            for( ; x1 <= bw - span * 2; x1 += span * 2 ) // two vectors per pass; the remainder falls through to the scalar loop
+            {
+                v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1)));
+                v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1)));
+                v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1 + span)));
+                v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1 + span)));
+
+                v_int16x8 v_xy[2];
+                v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
+                v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
+                v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]); // interleave x and y into (x, y) pairs
+
+                v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
+                v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
+                v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
+            }
+        }
+        #endif // CV_SIMD128
+        for( ; x1 < bw; x1++ ) // scalar tail for the columns the vector paths left over
+        {
+            int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS); // keep INTER_BITS fractional bits
+            int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
+            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
+            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
+            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
+                    (X & (INTER_TAB_SIZE-1)));
+        }
+    }
+}
+
 } // hal::
 } // cv::
 
@@ -3204,12 +3211,6 @@ public:
         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
 
-        #if CV_TRY_SSE4_1
-        Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
-        if(CV_CPU_HAS_SUPPORT_SSE4_1)
-            pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
-        #endif
-
         for( y = range.start; y < range.end; y += bh0 )
         {
             for( x = 0; x < width; x += bw0 )
@@ -3228,57 +3229,9 @@ public:
                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
 
                     if( interpolation == INTER_NEAREST )
-                    {
-                        #if CV_TRY_SSE4_1
-                        if (pwarp_impl_sse4)
-                            pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
-                        else
-                        #endif
-                        #if CV_SIMD128_64F
-                        WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
-                        #else
-                        for( int x1 = 0; x1 < bw; x1++ )
-                        {
-                            double W = W0 + M[6]*x1;
-                            W = W ? 1./W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1*2] = saturate_cast<short>(X);
-                            xy[x1*2+1] = saturate_cast<short>(Y);
-                        }
-                        #endif
-                    }
+                        hal::warpPerspectiveBlocklineNN(M, xy, X0, Y0, W0, bw);
                     else
-                    {
-                        short* alpha = A + y1*bw;
-
-                        #if CV_TRY_SSE4_1
-                        if (pwarp_impl_sse4)
-                            pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
-                        else
-                        #endif
-                        #if CV_SIMD128_64F
-                        WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
-                        #else
-                        for( int x1 = 0; x1 < bw; x1++ )
-                        {
-                            double W = W0 + M[6]*x1;
-                            W = W ? INTER_TAB_SIZE/W : 0;
-                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
-                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
-                            int X = saturate_cast<int>(fX);
-                            int Y = saturate_cast<int>(fY);
-
-                            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
-                            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
-                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
-                                                (X & (INTER_TAB_SIZE-1)));
-                        }
-                        #endif
-                    }
+                        hal::warpPerspectiveBlockline(M, xy, A + y1*bw, X0, Y0, W0, bw);
                 }
 
                 if( interpolation == INTER_NEAREST )
@@ -3371,6 +3324,74 @@ void warpPerspective(int src_type,
     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 }
 
+void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
+{
+    CALL_HAL(warpPerspectiveBlocklineNN, cv_hal_warpPerspectiveBlocklineNN, M, xy, X0, Y0, W0, bw);
+
+    #if CV_TRY_SSE4_1
+    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
+    if(CV_CPU_HAS_SUPPORT_SSE4_1)
+        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
+
+    if (pwarp_impl_sse4)
+        pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
+    else
+    #endif
+    {
+        #if CV_SIMD128_64F
+        WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
+        #else
+        for( int x1 = 0; x1 < bw; x1++ )
+        {
+            double W = W0 + M[6]*x1;
+            W = W ? 1./W : 0;
+            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
+            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
+            int X = saturate_cast<int>(fX);
+            int Y = saturate_cast<int>(fY);
+
+            xy[x1*2] = saturate_cast<short>(X);
+            xy[x1*2+1] = saturate_cast<short>(Y);
+        }
+        #endif
+    }
+}
+
+void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
+{
+    CALL_HAL(warpPerspectiveBlockline, cv_hal_warpPerspectiveBlockline, M, xy, alpha, X0, Y0, W0, bw);
+
+    #if CV_TRY_SSE4_1
+    Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
+    if(CV_CPU_HAS_SUPPORT_SSE4_1)
+        pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
+
+    if (pwarp_impl_sse4)
+        pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
+    else
+    #endif
+    {
+        #if CV_SIMD128_64F
+        WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
+        #else
+        for( int x1 = 0; x1 < bw; x1++ )
+        {
+            double W = W0 + M[6]*x1;
+            W = W ? INTER_TAB_SIZE/W : 0;
+            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
+            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
+            int X = saturate_cast<int>(fX);
+            int Y = saturate_cast<int>(fY);
+
+            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
+            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
+            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
+                                (X & (INTER_TAB_SIZE-1)));
+        }
+        #endif
+    }
+}
+
 } // hal::
 } // cv::
 

From 340a390ea2a152aadf5866428e1457c51c2ec10b Mon Sep 17 00:00:00 2001
From: stepkamipt <stepkamipt@gmail.com>
Date: Fri, 2 Aug 2024 10:07:36 +0200
Subject: [PATCH 12/17] Fix path to 3rdparty cmake.

The current code uses CMAKE_SOURCE_DIR, which works well when OpenCV is a standalone CMake project,
but when OpenCV is built as part of a larger CMake project (e.g. one that includes
opencv and opencv_contrib) this path is incorrect, unlike OpenCV_SOURCE_DIR.
---
 modules/videoio/cmake/detect_obsensor.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/videoio/cmake/detect_obsensor.cmake b/modules/videoio/cmake/detect_obsensor.cmake
index f0b66015eb..c7e6164c0f 100644
--- a/modules/videoio/cmake/detect_obsensor.cmake
+++ b/modules/videoio/cmake/detect_obsensor.cmake
@@ -1,7 +1,7 @@
 # --- obsensor ---
 if(NOT HAVE_OBSENSOR)
   if(OBSENSOR_USE_ORBBEC_SDK)
-    include(${CMAKE_SOURCE_DIR}/3rdparty/orbbecsdk/orbbecsdk.cmake)
+    include("${OpenCV_SOURCE_DIR}/3rdparty/orbbecsdk/orbbecsdk.cmake")
     download_orbbec_sdk(ORBBEC_SDK_ROOT_DIR)
     message(STATUS "ORBBEC_SDK_ROOT_DIR: ${ORBBEC_SDK_ROOT_DIR}")
     if(ORBBEC_SDK_ROOT_DIR)

From 796974cccc5c3e6dfcf6e21583cc7c62fa8e372b Mon Sep 17 00:00:00 2001
From: Aven <Aven@ocr.one>
Date: Sun, 4 Aug 2024 05:04:03 +0800
Subject: [PATCH 13/17] fix namespace-related compilation errors:
 #25199

---
 modules/dnn/src/layers/elementwise_layers.cpp  | 2 +-
 modules/dnn/src/layers/nary_eltwise_layers.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 6c06554d5f..770939710d 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -932,7 +932,7 @@ struct GeluFunctor : public BaseFunctor {
 #endif
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
         return std::make_shared<ov::op::v0::Gelu>(node);
     }
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
index 659e7e29a8..305070f9b8 100644
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -1006,7 +1006,7 @@ public:
         // In case only one input
         if (inputs.size() == 1) {
             auto &ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-            ngraph::OutputVector inp{ieInpNode};
+            ov::OutputVector inp{ieInpNode};
             auto blank = std::make_shared<ov::op::v0::Concat>(inp, 0);
             return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
         }

From a15cd4b63dba4b1286bfc6b23e4fe99a77d2620e Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Thu, 1 Aug 2024 10:57:19 +0300
Subject: [PATCH 14/17] Set and check allocator pointer for all cv::Mat
 instances.

---
 modules/core/src/matrix.cpp          |   6 +-
 modules/core/test/test_allocator.cpp | 144 +++++++++++++++++++++++++++
 2 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 modules/core/test/test_allocator.cpp

diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 1b11e12145..f05711bba8 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -692,16 +692,13 @@ void Mat::create(int d, const int* _sizes, int _type)
     if( total() > 0 )
     {
         MatAllocator *a = allocator, *a0 = getDefaultAllocator();
-#ifdef HAVE_TGPU
-        if( !a || a == tegra::getAllocator() )
-            a = tegra::getAllocator(d, _sizes, _type);
-#endif
         if(!a)
             a = a0;
         try
         {
             u = a->allocate(dims, size, _type, 0, step.p, ACCESS_RW /* ignored */, USAGE_DEFAULT);
             CV_Assert(u != 0);
+            allocator = a;
         }
         catch (...)
         {
@@ -709,6 +706,7 @@ void Mat::create(int d, const int* _sizes, int _type)
                 throw;
             u = a0->allocate(dims, size, _type, 0, step.p, ACCESS_RW /* ignored */, USAGE_DEFAULT);
             CV_Assert(u != 0);
+            allocator = a0;
         }
         CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) );
     }
diff --git a/modules/core/test/test_allocator.cpp b/modules/core/test/test_allocator.cpp
new file mode 100644
index 0000000000..88b03b689e
--- /dev/null
+++ b/modules/core/test/test_allocator.cpp
@@ -0,0 +1,144 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+// Dummy allocator implementation copied from the default OpenCV allocator with some simplifications
+struct DummyAllocator: public cv::MatAllocator
+{
+public:
+    DummyAllocator() {};
+    ~DummyAllocator() {};
+
+    cv::UMatData* allocate(int dims, const int* sizes, int type,
+                    void* data0, size_t* step, cv::AccessFlag flags,
+                    cv::UMatUsageFlags usageFlags) const
+    {
+        CV_UNUSED(flags);
+        CV_UNUSED(usageFlags);
+
+        size_t total = CV_ELEM_SIZE(type);
+        for( int i = dims-1; i >= 0; i-- )
+        {
+            if( step )
+            {
+                if( data0 && step[i] != CV_AUTOSTEP )
+                {
+                    CV_Assert(total <= step[i]);
+                    total = step[i];
+                }
+                else
+                    step[i] = total;
+            }
+            total *= sizes[i];
+        }
+
+        uchar* data = nullptr;
+        if (data0)
+        {
+            data = (uchar*)data0;
+        }
+        else
+        {
+            data = new uchar[total];
+            DummyAllocator::allocatedBytes += total;
+            DummyAllocator::allocations++;
+        }
+        cv::UMatData* u = new cv::UMatData(this);
+        u->data = u->origdata = data;
+        u->size = total;
+        if(data0)
+            u->flags |= cv::UMatData::USER_ALLOCATED;
+
+        return u;
+    }
+
+    bool allocate(cv::UMatData* u, cv::AccessFlag accessFlags, cv::UMatUsageFlags usageFlags) const
+    {
+        CV_UNUSED(accessFlags);
+        CV_UNUSED(usageFlags);
+
+        if(!u) return false;
+        return true;
+    }
+
+    void deallocate(cv::UMatData* u) const
+    {
+        if(!u)
+            return;
+
+        CV_Assert(u->urefcount == 0);
+        CV_Assert(u->refcount == 0);
+        if( !(u->flags & cv::UMatData::USER_ALLOCATED) )
+        {
+            delete[] u->origdata;
+            DummyAllocator::deallocations++;
+            u->origdata = 0;
+        }
+        delete u;
+    }
+
+    static size_t allocatedBytes;
+    static int allocations;
+    static int deallocations;
+};
+
+size_t DummyAllocator::allocatedBytes = 0;
+int  DummyAllocator::allocations = 0;
+int  DummyAllocator::deallocations = 0;
+
+cv::MatAllocator* getDummyAllocator()
+{
+    static cv::MatAllocator* allocator = new DummyAllocator;
+    return allocator;
+}
+
+struct AllocatorTest : public testing::Test {
+    void SetUp() override {
+        cv::MatAllocator* allocator = getDummyAllocator();
+        EXPECT_TRUE(allocator != nullptr);
+        cv::Mat::setDefaultAllocator(allocator);
+    }
+
+    void TearDown() override {
+        cv::Mat::setDefaultAllocator(cv::Mat::getStdAllocator());
+    }
+};
+
+TEST_F(AllocatorTest, DummyAllocator)
+{
+    cv::MatAllocator* dummy = getDummyAllocator();
+
+    DummyAllocator::allocatedBytes = 0;
+    DummyAllocator::allocations = 0;
+    DummyAllocator::deallocations = 0;
+
+    {
+        cv::Mat src1 = cv::Mat::ones (16, 16, CV_8UC1);
+        EXPECT_TRUE(!src1.empty());
+        EXPECT_EQ(src1.allocator, dummy);
+
+        cv::Mat src1_roi = src1(cv::Rect(2,2,8,8));
+        EXPECT_EQ(src1_roi.allocator, dummy);
+
+        cv::MatAllocator* standard = cv::Mat::getStdAllocator();
+        cv::Mat::setDefaultAllocator(standard);
+        cv::Mat src2 = cv::Mat::ones (16, 16, CV_8UC1);
+        EXPECT_TRUE(!src2.empty());
+        EXPECT_EQ(src2.allocator, standard);
+
+        src1.create(32, 32, CV_8UC1);
+        EXPECT_EQ(src1.allocator, dummy);
+    }
+
+    size_t expect_allocated = 16*16*sizeof(uchar) + 32*32*sizeof(uchar);
+    EXPECT_EQ(expect_allocated, DummyAllocator::allocatedBytes);
+
+    // ROI should not trigger extra allocations
+    EXPECT_EQ(2, DummyAllocator::allocations);
+    EXPECT_EQ(2, DummyAllocator::deallocations);
+}
+
+}} // namespace

From 6ed603e917424e0cb10e3d019706db1b0ff1e8af Mon Sep 17 00:00:00 2001
From: Maxim Smolskiy <mithridatus@mail.ru>
Date: Mon, 5 Aug 2024 13:28:07 +0300
Subject: [PATCH 15/17] Merge pull request #25991 from
 MaximSmolskiy:improve-corners-matching-in-ChessBoardDetector-NeighborsFinder-findCornerNeighbor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Improve corners matching in ChessBoardDetector::NeighborsFinder::findCornerNeighbor #25991

### Pull Request Readiness Checklist

Idea was mentioned in `Section III-B. New Heuristic for Quadrangle Linking` of `Rufli, Martin & Scaramuzza, Davide & Siegwart, Roland. (2008). Automatic Detection of Checkerboards on Blurred and Distorted Images. 2008 IEEE/RSJ International Conference on Intelligent Robots and Systems, IROS. 3121-3126. 10.1109/IROS.2008.4650703` (https://rpg.ifi.uzh.ch/docs/IROS08_scaramuzza_b.pdf):
![Снимок экрана от 2024-08-05 09-51-27](https://github.com/user-attachments/assets/7a090ccc-c24c-4dfb-b0dd-259c8709eb72)
```
* For each candidate pair, focus on the quadrangles they belong to and draw two straight lines passing through the midsections of the respective quadrangle edges (see Fig. 6).
* If the candidate corner and the source corner are on the same side of every of the four straight lines drawn this way (this corresponds to the yellow shaded area in Fig. 6), then the corners are successfully matched.
```

By improving corners matching, we can increase the search radius (`thresh_scale`).

I tested this PR with benchmark
```
python3 objdetect_benchmark.py --configuration=generate_run --board_x=7 --path=res_chessboard --synthetic_object=chessboard
```
This PR increases the number of detected chessboards by about 3% (`cell_img_size = 100`) and 7% (`cell_img_size = 10`):
```
cell_img_size = 100 (default)

before
                                 category  detected chessboard  total detected chessboard  total chessboard  average detected error chessboard
                                      all             0.910417                      13110             14400                           0.599746
Total detected time:  147.50906700000002 sec

after
                                 category  detected chessboard  total detected chessboard  total chessboard  average detected error chessboard
                                      all             0.941667                      13560             14400                           0.596726
Total detected time:  136.68963200000007 sec

----------------------------------------------------------------------------------------------------------------------------------------------

cell_img_size = 10

before
                                 category  detected chessboard  total detected chessboard  total chessboard  average detected error chessboard
                                      all             0.539792                       7773             14400                           4.208237
Total detected time:  2.668964 sec

after
                                 category  detected chessboard  total detected chessboard  total chessboard  average detected error chessboard
                                      all             0.579167                       8340             14400                           4.198448
Total detected time:  2.535998999999999 sec
```

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 modules/calib3d/src/calibinit.cpp | 56 +++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index 1767c38981..eb6b87ce7d 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -222,7 +222,7 @@ public:
     int all_quads_count;
 
     struct NeighborsFinder {
-        const float thresh_scale = 1.f;
+        const float thresh_scale = sqrt(2.f);
         ChessBoardDetector& detector;
         std::vector<int> neighbors_indices;
         std::vector<float> neighbors_dists;
@@ -232,8 +232,9 @@ public:
         NeighborsFinder(ChessBoardDetector& detector);
 
         bool findCornerNeighbor(
-            const int idx,
-            const cv::Point2f& pt,
+            const int quad_idx,
+            const int corner_idx,
+            const cv::Point2f& corner_pt,
             float& min_dist,
             const float radius,
             int& closest_quad_idx,
@@ -514,9 +515,23 @@ ChessBoardDetector::NeighborsFinder::NeighborsFinder(ChessBoardDetector& _detect
     neighbors_dists.resize(all_corners_count);
 }
 
+static double pointSideFromLine(const Point2f& line_direction_vector, const Point2f& vector)
+{
+    return line_direction_vector.cross(vector);
+}
+
+static bool arePointsOnSameSideFromLine(const Point2f& line_pt1, const Point2f& line_pt2, const Point2f& pt1, const Point2f& pt2)
+{
+    const Point2f line_direction_vector = line_pt2 - line_pt1;
+    const Point2f vector1 = pt1 - line_pt1;
+    const Point2f vector2 = pt2 - line_pt1;
+    return pointSideFromLine(line_direction_vector, vector1) * pointSideFromLine(line_direction_vector, vector2) > 0.;
+}
+
 bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
-    const int idx,
-    const cv::Point2f& pt,
+    const int quad_idx,
+    const int corner_idx,
+    const cv::Point2f& corner_pt,
     float& min_dist,
     const float radius,
     int& closest_quad_idx,
@@ -525,12 +540,12 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
 {
     ChessBoardQuad* p_all_quads = detector.all_quads.data();
 
-    const ChessBoardQuad& cur_quad = (const ChessBoardQuad&)p_all_quads[idx];
+    const ChessBoardQuad& cur_quad = (const ChessBoardQuad&)p_all_quads[quad_idx];
     int closest_neighbor_idx = -1;
     ChessBoardQuad *closest_quad = 0;
 
     // find the closest corner in all other quadrangles
-    const std::vector<float> query = { pt.x, pt.y };
+    const std::vector<float> query = { corner_pt.x, corner_pt.y };
     const cvflann::SearchParams search_params(-1);
     const int neighbors_count = all_quads_pts_index.radiusSearch(query, neighbors_indices, neighbors_dists, radius, search_params);
 
@@ -538,7 +553,7 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
     {
         const int neighbor_idx = neighbors_indices[neighbor_idx_idx];
         const int k = neighbor_idx >> 2;
-        if (k == idx)
+        if (k == quad_idx)
             continue;
 
         ChessBoardQuad& q_k = p_all_quads[k];
@@ -546,7 +561,8 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
         if (q_k.neighbors[j])
             continue;
 
-        const float dist = normL2Sqr<float>(pt - all_quads_pts[neighbor_idx]);
+        const Point2f neighbor_pt = all_quads_pts[neighbor_idx];
+        const float dist = normL2Sqr<float>(corner_pt - neighbor_pt);
         if (dist <= cur_quad.edge_len * thresh_scale &&
             dist <= q_k.edge_len * thresh_scale)
         {
@@ -560,6 +576,24 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
                 DPRINTF("Incompatible edge lengths");
                 continue;
             }
+
+            const Point2f mid_pt1 = (cur_quad.corners[corner_idx]->pt + cur_quad.corners[(corner_idx + 1) & 3]->pt) / 2.f;
+            const Point2f mid_pt2 = (cur_quad.corners[(corner_idx + 2) & 3]->pt + cur_quad.corners[(corner_idx + 3) & 3]->pt) / 2.f;
+            if (!arePointsOnSameSideFromLine(mid_pt1, mid_pt2, corner_pt, neighbor_pt))
+                continue;
+
+            const Point2f mid_pt3 = (cur_quad.corners[(corner_idx + 1) & 3]->pt + cur_quad.corners[(corner_idx + 2) & 3]->pt) / 2.f;
+            const Point2f mid_pt4 = (cur_quad.corners[(corner_idx + 3) & 3]->pt + cur_quad.corners[corner_idx]->pt) / 2.f;
+            if (!arePointsOnSameSideFromLine(mid_pt3, mid_pt4, corner_pt, neighbor_pt))
+                continue;
+
+            const Point2f neighbor_pt_diagonal = q_k.corners[(j + 2) & 3]->pt;
+            if (!arePointsOnSameSideFromLine(mid_pt1, mid_pt2, corner_pt, neighbor_pt_diagonal))
+                continue;
+
+            if (!arePointsOnSameSideFromLine(mid_pt3, mid_pt4, neighbor_pt, neighbor_pt_diagonal))
+                continue;
+
             closest_neighbor_idx = neighbor_idx;
             closest_quad_idx = k;
             closest_corner_idx = j;
@@ -589,7 +623,7 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
             if (cur_quad.neighbors[j] == closest_quad)
                 break;
 
-            if (normL2Sqr<float>(closest_corner_pt - all_quads_pts[(idx << 2) + j]) < min_dist)
+            if (normL2Sqr<float>(closest_corner_pt - all_quads_pts[(quad_idx << 2) + j]) < min_dist)
                 break;
         }
         if (j < 4)
@@ -1793,6 +1827,7 @@ void ChessBoardDetector::findQuadNeighbors()
 
             bool found = neighborsFinder.findCornerNeighbor(
                 idx,
+                i,
                 pt,
                 min_dist,
                 radius,
@@ -1813,6 +1848,7 @@ void ChessBoardDetector::findQuadNeighbors()
 
             found = neighborsFinder.findCornerNeighbor(
                 closest_quad_idx,
+                closest_corner_idx,
                 closest_corner_pt,
                 min_dist,
                 radius,

From 75fca7d9d0e4b7fa796d36d4c4ad98f5d210a9b5 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Tue, 23 Jul 2024 15:42:21 +0300
Subject: [PATCH 16/17] Added fisheye::distort with non-identity projection
 matrix.

---
 modules/calib3d/include/opencv2/calib3d.hpp | 17 +++-
 modules/calib3d/src/fisheye.cpp             | 42 ++++++++++
 modules/calib3d/test/test_fisheye.cpp       | 90 ++++++++++++++++++++-
 3 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index aedbaf930a..9fc6773450 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -3835,10 +3835,25 @@ namespace fisheye
     @param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
 
     Note that the function assumes the camera intrinsic matrix of the undistorted points to be identity.
-    This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$.
+    This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$ or
+    use another function overload.
      */
     CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray K, InputArray D, double alpha = 0);
 
+    /** @overload
+    Overload of distortPoints function to handle cases when undistorted points were obtained with a non-identity
+    camera matrix, e.g. the output of #estimateNewCameraMatrixForUndistortRectify.
+    @param undistorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is
+    the number of points in the view.
+    @param Kundistorted Camera intrinsic matrix used as new camera matrix for undistortion.
+    @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
+    @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param alpha The skew coefficient.
+    @param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
+    @sa estimateNewCameraMatrixForUndistortRectify
+    */
+    CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray Kundistorted, InputArray K, InputArray D, double alpha = 0);
+
     /** @brief Undistorts 2D points using fisheye model
 
     @param distorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is the
diff --git a/modules/calib3d/src/fisheye.cpp b/modules/calib3d/src/fisheye.cpp
index 751a1aa6da..4aec4324e0 100644
--- a/modules/calib3d/src/fisheye.cpp
+++ b/modules/calib3d/src/fisheye.cpp
@@ -315,6 +315,48 @@ void cv::fisheye::distortPoints(InputArray undistorted, OutputArray distorted, I
     }
 }
 
+void cv::fisheye::distortPoints(InputArray _undistorted, OutputArray distorted, InputArray Kundistorted, InputArray K, InputArray D, double alpha)
+{
+    CV_INSTRUMENT_REGION();
+
+    CV_Assert(_undistorted.type() == CV_32FC2 || _undistorted.type() == CV_64FC2);
+    CV_Assert(Kundistorted.size() == Size(3,3) && (Kundistorted.type() == CV_32F || Kundistorted.type() == CV_64F));
+
+    cv::Mat undistorted = _undistorted.getMat();
+    cv::Mat normalized(undistorted.size(), CV_64FC2);
+
+    Mat Knew = Kundistorted.getMat();
+
+    double cx, cy, fx, fy;
+    if (Knew.depth() == CV_32F)
+    {
+        fx = (double)Knew.at<float>(0, 0);
+        fy = (double)Knew.at<float>(1, 1);
+        cx = (double)Knew.at<float>(0, 2);
+        cy = (double)Knew.at<float>(1, 2);
+    }
+    else
+    {
+        fx = Knew.at<double>(0, 0);
+        fy = Knew.at<double>(1, 1);
+        cx = Knew.at<double>(0, 2);
+        cy = Knew.at<double>(1, 2);
+    }
+
+    size_t n = undistorted.total();
+    const Vec2f* Xf = undistorted.ptr<Vec2f>();
+    const Vec2d* Xd = undistorted.ptr<Vec2d>();
+    Vec2d* normXd = normalized.ptr<Vec2d>();
+    for (size_t i = 0; i < n; i++)
+    {
+        Vec2d p = undistorted.depth() == CV_32F ? (Vec2d)Xf[i] : Xd[i];
+        normXd[i][0] = (p[0] - cx) / fx;
+        normXd[i][1] = (p[1] - cy) / fy;
+    }
+
+    cv::fisheye::distortPoints(normalized, distorted, K, D, alpha);
+}
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /// cv::fisheye::undistortPoints
 
diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp
index 36b7d0d653..d7368c3190 100644
--- a/modules/calib3d/test/test_fisheye.cpp
+++ b/modules/calib3d/test/test_fisheye.cpp
@@ -107,7 +107,6 @@ TEST_F(fisheyeTest, distortUndistortPoints)
     int height = imageSize.height;
 
     /* Create test points */
-    std::vector<cv::Point2d> points0Vector;
     cv::Mat principalPoints = (cv::Mat_<double>(5, 2) << K(0, 2), K(1, 2), // (cx, cy)
                                                                     /* Image corners */
                                                                     0, 0,
@@ -150,6 +149,95 @@ TEST_F(fisheyeTest, distortUndistortPoints)
     }
 }
 
+TEST_F(fisheyeTest, distortUndistortPointsNewCameraFixed)
+{
+    int width = imageSize.width;
+    int height = imageSize.height;
+
+    /* Random points inside image */
+    cv::Mat xy[2] = {};
+    xy[0].create(100, 1, CV_64F);
+    theRNG().fill(xy[0], cv::RNG::UNIFORM, 0, width); // x
+    xy[1].create(100, 1, CV_64F);
+    theRNG().fill(xy[1], cv::RNG::UNIFORM, 0, height); // y
+
+    cv::Mat randomPoints;
+    merge(xy, 2, randomPoints);
+
+    cv::Mat points0 = randomPoints;
+    cv::Mat Reye = cv::Mat::eye(3, 3, CV_64FC1);
+
+    cv::Mat Knew;
+    cv::fisheye::estimateNewCameraMatrixForUndistortRectify(K, D, imageSize, Reye,  Knew);
+
+    /* Distort -> Undistort */
+    cv::Mat distortedPoints;
+    cv::fisheye::distortPoints(points0, distortedPoints, Knew, K, D);
+    cv::Mat undistortedPoints;
+    cv::fisheye::undistortPoints(distortedPoints, undistortedPoints, K, D, Reye, Knew);
+
+    EXPECT_MAT_NEAR(points0, undistortedPoints, 1e-8);
+
+    /* Undistort -> Distort */
+    cv::fisheye::undistortPoints(points0, undistortedPoints, K, D, Reye, Knew);
+    cv::fisheye::distortPoints(undistortedPoints, distortedPoints, Knew, K, D);
+
+    EXPECT_MAT_NEAR(points0, distortedPoints, 1e-8);
+}
+
+TEST_F(fisheyeTest, distortUndistortPointsNewCameraRandom)
+{
+    int width = imageSize.width;
+    int height = imageSize.height;
+
+    /* Create test points */
+    std::vector<cv::Point2d> points0Vector;
+    cv::Mat principalPoints = (cv::Mat_<double>(5, 2) << K(0, 2), K(1, 2), // (cx, cy)
+                                                                    /* Image corners */
+                                                                    0, 0,
+                                                                    0, height,
+                                                                    width, 0,
+                                                                    width, height
+                                                                    );
+
+    /* Random points inside image */
+    cv::Mat xy[2] = {};
+    xy[0].create(100, 1, CV_64F);
+    theRNG().fill(xy[0], cv::RNG::UNIFORM, 0, width); // x
+    xy[1].create(100, 1, CV_64F);
+    theRNG().fill(xy[1], cv::RNG::UNIFORM, 0, height); // y
+
+    cv::Mat randomPoints;
+    merge(xy, 2, randomPoints);
+
+    cv::Mat points0;
+    cv::Mat Reye = cv::Mat::eye(3, 3, CV_64FC1);
+    cv::vconcat(principalPoints.reshape(2), randomPoints, points0);
+
+    /* Test with random D set */
+    for (size_t i = 0; i < 10; ++i) {
+        cv::Mat distortion(1, 4, CV_64F);
+        theRNG().fill(distortion, cv::RNG::UNIFORM, -0.001, 0.001);
+
+        cv::Mat Knew;
+        cv::fisheye::estimateNewCameraMatrixForUndistortRectify(K, distortion, imageSize, Reye,  Knew);
+
+        /* Distort -> Undistort */
+        cv::Mat distortedPoints;
+        cv::fisheye::distortPoints(points0, distortedPoints, Knew, K, distortion);
+        cv::Mat undistortedPoints;
+        cv::fisheye::undistortPoints(distortedPoints, undistortedPoints, K, distortion, Reye, Knew);
+
+        EXPECT_MAT_NEAR(points0, undistortedPoints, 1e-8);
+
+        /* Undistort -> Distort */
+        cv::fisheye::undistortPoints(points0, undistortedPoints, K, distortion, Reye, Knew);
+        cv::fisheye::distortPoints(undistortedPoints, distortedPoints, Knew, K, distortion);
+
+        EXPECT_MAT_NEAR(points0, distortedPoints, 1e-8);
+    }
+}
+
 TEST_F(fisheyeTest, solvePnP)
 {
     const int n = 16;

From 49459d46e2234363af81d1692511ae8cc0c256f9 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <2536374+asmorkalov@users.noreply.github.com>
Date: Tue, 6 Aug 2024 11:40:58 +0300
Subject: [PATCH 17/17] Merge pull request #25932 from
 asmorkalov:as/HAL_cvtColor_aprox

Added xxxApprox overloads for YUV color conversions in HAL and AlgorithmHint to cvtColor #25932

The xxxApprox overloads allow HAL functions to be implemented with fewer bits for FP arithmetic.

The hint was introduced in #25792 and #25911

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 modules/imgproc/include/opencv2/imgproc.hpp |   6 +-
 modules/imgproc/src/color.cpp               |  23 +--
 modules/imgproc/src/color.hpp               |  18 +--
 modules/imgproc/src/color_yuv.dispatch.cpp  | 152 +++++++++++---------
 modules/imgproc/src/hal_replacement.hpp     | 148 +++++++++++++++++++
 modules/imgproc/test/test_color.cpp         |   6 +-
 6 files changed, 265 insertions(+), 88 deletions(-)

diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 4456b3a88c..eb92e8cc21 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -3726,10 +3726,11 @@ floating-point.
 @param code color space conversion code (see #ColorConversionCodes).
 @param dstCn number of channels in the destination image; if the parameter is 0, the number of the
 channels is derived automatically from src and code.
+@param hint Implementation modification flags. See #AlgorithmHint
 
 @see @ref imgproc_color_conversions
  */
-CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0 );
+CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0, AlgorithmHint hint = cv::ALGO_HINT_DEFAULT );
 
 /** @brief Converts an image from one color space to another where the source image is
 stored in two planes.
@@ -3748,8 +3749,9 @@ This function only supports YUV420 to RGB conversion as of now.
 - #COLOR_YUV2RGB_NV21
 - #COLOR_YUV2BGRA_NV21
 - #COLOR_YUV2RGBA_NV21
+@param hint Implementation modification flags. See #AlgorithmHint
 */
-CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code );
+CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code, AlgorithmHint hint = cv::ALGO_HINT_DEFAULT );
 
 /** @brief main function for all demosaicing processes
 
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index dde8e1344c..703511b9cf 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -168,7 +168,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
 // helper function for dual-plane modes
 
-void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int code )
+void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int code, AlgorithmHint hint )
 {
     // only YUV420 is currently supported
     switch (code)
@@ -181,7 +181,7 @@ void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, in
             return;
     }
 
-    cvtColorTwoPlaneYUV2BGRpair(_ysrc, _uvsrc, _dst, dstChannels(code), swapBlue(code), uIndex(code));
+    cvtColorTwoPlaneYUV2BGRpair(_ysrc, _uvsrc, _dst, hint, dstChannels(code), swapBlue(code), uIndex(code));
 }
 
 
@@ -189,10 +189,13 @@ void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, in
 //                                   The main function                                  //
 //////////////////////////////////////////////////////////////////////////////////////////
 
-void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
+void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == cv::ALGO_HINT_DEFAULT)
+        hint = cv::getDefaultAlgorithmHint();
+
     CV_Assert(!_src.empty());
 
     if(dcn <= 0)
@@ -244,12 +247,12 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
         case COLOR_BGR2YUV:   case COLOR_RGB2YUV:
-            cvtColorBGR2YUV(_src, _dst, swapBlue(code), code == COLOR_BGR2YCrCb || code == COLOR_RGB2YCrCb);
+            cvtColorBGR2YUV(_src, _dst, hint, swapBlue(code), code == COLOR_BGR2YCrCb || code == COLOR_RGB2YCrCb);
             break;
 
         case COLOR_YCrCb2BGR: case COLOR_YCrCb2RGB:
         case COLOR_YUV2BGR:   case COLOR_YUV2RGB:
-            cvtColorYUV2BGR(_src, _dst, dcn, swapBlue(code), code == COLOR_YCrCb2BGR || code == COLOR_YCrCb2RGB);
+            cvtColorYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), code == COLOR_YCrCb2BGR || code == COLOR_YCrCb2RGB);
             break;
 
         case COLOR_BGR2XYZ:
@@ -321,14 +324,14 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
             // http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
             // http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
-            cvtColorTwoPlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code));
+            cvtColorTwoPlaneYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), uIndex(code));
             break;
 
         case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
         case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
             //http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
             //http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxN Y plane followed by (N/2)x(N/2) U and V planes
-            cvtColorThreePlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code));
+            cvtColorThreePlaneYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), uIndex(code));
             break;
 
         case COLOR_YUV2GRAY_420:
@@ -337,7 +340,7 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
         case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
-            cvtColorBGR2ThreePlaneYUV(_src, _dst, swapBlue(code), uIndex(code));
+            cvtColorBGR2ThreePlaneYUV(_src, _dst, hint, swapBlue(code), uIndex(code));
             break;
 
         case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
@@ -349,7 +352,7 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
             {
                 int ycn  = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2BGR_UYVY ||
                             code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
-                cvtColorOnePlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code), ycn);
+                cvtColorOnePlaneYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), uIndex(code), ycn);
                 break;
             }
 
@@ -362,7 +365,7 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
             {
                 int ycn  = (code==COLOR_RGB2YUV_UYVY ||  code==COLOR_BGR2YUV_UYVY ||
                             code==COLOR_RGBA2YUV_UYVY || code==COLOR_BGRA2YUV_UYVY) ? 1 : 0;
-                cvtColorOnePlaneBGR2YUV(_src, _dst, swapBlue(code), uIndex(code), ycn);
+                cvtColorOnePlaneBGR2YUV(_src, _dst, hint, swapBlue(code), uIndex(code), ycn);
                 break;
             }
 
diff --git a/modules/imgproc/src/color.hpp b/modules/imgproc/src/color.hpp
index 6ebca26a2c..883c9ccab4 100644
--- a/modules/imgproc/src/color.hpp
+++ b/modules/imgproc/src/color.hpp
@@ -556,15 +556,15 @@ void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bo
 void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb );
 void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb );
 
-void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb);
-void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb);
-
-void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn);
-void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn);
-void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
-void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx );
-void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
-void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx);
+void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, bool crcb);
+void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, bool crcb);
+
+void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx, int ycn );
+void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx, int ycn );
+void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx );
+void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx );
+void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx );
+void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx );
 void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
 void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi );
 
diff --git a/modules/imgproc/src/color_yuv.dispatch.cpp b/modules/imgproc/src/color_yuv.dispatch.cpp
index 71d840d857..89e933affa 100644
--- a/modules/imgproc/src/color_yuv.dispatch.cpp
+++ b/modules/imgproc/src/color_yuv.dispatch.cpp
@@ -18,13 +18,18 @@ namespace cv {
 namespace hal {
 
 // 8u, 16u, 32f
-void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
+static void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
-                 int depth, int scn, bool swapBlue, bool isCbCr)
+                 int depth, int scn, bool swapBlue, bool isCbCr, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == ALGO_HINT_APPROX)
+    {
+        CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUVApprox, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr);
+    }
+
     CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr);
 
 #if defined(HAVE_IPP)
@@ -66,13 +71,18 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
-void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
+static void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
                  uchar * dst_data, size_t dst_step,
                  int width, int height,
-                 int depth, int dcn, bool swapBlue, bool isCbCr)
+                 int depth, int dcn, bool swapBlue, bool isCbCr, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == ALGO_HINT_APPROX)
+    {
+        CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr);
+    }
+
     CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr);
 
 
@@ -115,63 +125,79 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
-// 4:2:0, two planes in one array: Y, UV interleaved
+// 4:2:0, two planes: Y, UV interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+static void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
                          uchar * dst_data, size_t dst_step,
                          int dst_width, int dst_height,
-                         int dcn, bool swapBlue, int uIdx)
+                         int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
+    if (hint == ALGO_HINT_APPROX)
+    {
+            CALL_HAL(cvtTwoPlaneYUVtoBGREx, cv_hal_cvtTwoPlaneYUVtoBGRExApprox,
+                y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
+    }
 
-    cvtTwoPlaneYUVtoBGR(
-            src_data, src_step, src_data + src_step * dst_height, src_step, dst_data, dst_step,
-            dst_width, dst_height, dcn, swapBlue, uIdx);
+    CALL_HAL(cvtTwoPlaneYUVtoBGREx, cv_hal_cvtTwoPlaneYUVtoBGREx,
+             y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
+
+    CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
-// 4:2:0, two planes: Y, UV interleaved
+// 4:2:0, two planes in one array: Y, UV interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
+static void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int dst_width, int dst_height,
-                         int dcn, bool swapBlue, int uIdx)
+                         int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
-    cvtTwoPlaneYUVtoBGR(y_data, src_step, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
+    if (hint == ALGO_HINT_APPROX)
+    {
+        CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
+    }
+
+    CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
+
+    cvtTwoPlaneYUVtoBGR(
+            src_data, src_step, src_data + src_step * dst_height, src_step, dst_data, dst_step,
+            dst_width, dst_height, dcn, swapBlue, uIdx, hint);
 }
 
 // 4:2:0, two planes: Y, UV interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
+static void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int dst_width, int dst_height,
-                         int dcn, bool swapBlue, int uIdx)
+                         int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
-    CALL_HAL(cvtTwoPlaneYUVtoBGREx, cv_hal_cvtTwoPlaneYUVtoBGREx,
-             y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
-
-    CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
-        CV_CPU_DISPATCH_MODES_ALL);
+    cvtTwoPlaneYUVtoBGR(y_data, src_step, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx, hint);
 }
 
 // 4:2:0, three planes in one array: Y, U, V
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+static void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                            uchar * dst_data, size_t dst_step,
                            int dst_width, int dst_height,
-                           int dcn, bool swapBlue, int uIdx)
+                           int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == ALGO_HINT_APPROX)
+    {
+        CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
+    }
+
     CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
 
     CV_CPU_DISPATCH(cvtThreePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
@@ -181,46 +207,39 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
 // 4:2:0, three planes in one array: Y, U, V
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
+static void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
                            uchar * dst_data, size_t dst_step,
                            int width, int height,
-                           int scn, bool swapBlue, int uIdx)
+                           int scn, bool swapBlue, int uIdx, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == ALGO_HINT_APPROX)
+    {
+        CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUVApprox, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx);
+    }
+
     CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx);
 
     CV_CPU_DISPATCH(cvtBGRtoThreePlaneYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx),
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
-// 4:2:0, two planes: Y, UV interleaved
-// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
-// 20-bit fixed-point arithmetics
-void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
-                         uchar * y_data, uchar * uv_data, size_t dst_step,
-                         int width, int height,
-                         int scn, bool swapBlue, int uIdx)
-{
-    CV_INSTRUMENT_REGION();
-
-    CALL_HAL(cvtBGRtoTwoPlaneYUV, cv_hal_cvtBGRtoTwoPlaneYUV,
-             src_data, src_step, y_data, dst_step, uv_data, dst_step, width, height, scn, swapBlue, uIdx);
-
-    CV_CPU_DISPATCH(cvtBGRtoTwoPlaneYUV, (src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx),
-        CV_CPU_DISPATCH_MODES_ALL);
-}
-
 // 4:2:2 interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+static void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int width, int height,
-                         int dcn, bool swapBlue, int uIdx, int ycn)
+                         int dcn, bool swapBlue, int uIdx, int ycn, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == ALGO_HINT_APPROX)
+    {
+        CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn);
+    }
+
     CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn);
 
     CV_CPU_DISPATCH(cvtOnePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn),
@@ -230,13 +249,18 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
 // 4:2:2 interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 14-bit fixed-point arithmetics is used
-void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+static void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int width, int height,
-                         int scn, bool swapBlue, int uIdx, int ycn)
+                         int scn, bool swapBlue, int uIdx, int ycn, AlgorithmHint hint)
 {
     CV_INSTRUMENT_REGION();
 
+    if (hint == ALGO_HINT_APPROX)
+    {
+        CALL_HAL(cvtOnePlaneBGRtoYUV, cv_hal_cvtOnePlaneBGRtoYUVApprox, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn);
+    }
+
     CALL_HAL(cvtOnePlaneBGRtoYUV, cv_hal_cvtOnePlaneBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn);
 
     CV_CPU_DISPATCH(cvtOnePlaneBGRtoYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn),
@@ -386,43 +410,43 @@ bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx,
 // HAL calls
 //
 
-void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, bool swapb, bool crcb)
+void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, bool crcb)
 {
     CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
 
     hal::cvtBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, h.scn, swapb, crcb);
+                     h.depth, h.scn, swapb, crcb, hint);
 }
 
-void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb)
+void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, bool crcb)
 {
     if(dcn <= 0) dcn = 3;
     CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
 
     hal::cvtYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                     h.depth, dcn, swapb, crcb);
+                     h.depth, dcn, swapb, crcb, hint);
 }
 
 // 4:2:2 interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn)
+void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx, int ycn)
 {
     CvtHelper< Set<2>, Set<3, 4>, Set<CV_8U>, FROM_UYVY > h(_src, _dst, dcn);
 
     hal::cvtOnePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                             dcn, swapb, uidx, ycn);
+                             dcn, swapb, uidx, ycn, hint);
 }
 
 // 4:2:2 interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 14-bit fixed-point arithmetics is used
-void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn)
+void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx, int ycn)
 {
     CvtHelper< Set<3, 4>, Set<2>, Set<CV_8U>, TO_UYVY > h(_src, _dst, 2);
 
     hal::cvtOnePlaneBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                             h.scn, swapb, uidx, ycn);
+                             h.scn, swapb, uidx, ycn, hint);
 }
 
 void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
@@ -435,12 +459,12 @@ void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
 // 4:2:0, three planes in one array: Y, U, V
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx)
+void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx)
 {
     CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
 
     hal::cvtBGRtoThreePlaneYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
-                               h.scn, swapb, uidx);
+                               h.scn, swapb, uidx, hint);
 }
 
 void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
@@ -460,32 +484,32 @@ void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
 // 4:2:0, three planes in one array: Y, U, V
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx)
+void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx)
 {
     if(dcn <= 0) dcn = 3;
     CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV> h(_src, _dst, dcn);
 
     hal::cvtThreePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows,
-                               dcn, swapb, uidx);
+                               dcn, swapb, uidx, hint);
 }
 
 // 4:2:0, two planes in one array: Y, UV interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
 // see also: http://www.fourcc.org/yuv.php#NV21, http://www.fourcc.org/yuv.php#NV12
-void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx )
+void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx )
 {
     if(dcn <= 0) dcn = 3;
     CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV> h(_src, _dst, dcn);
 
     hal::cvtTwoPlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows,
-                             dcn, swapb, uidx);
+                             dcn, swapb, uidx, hint);
 }
 
 // 4:2:0, two planes: Y, UV interleaved
 // Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
 // 20-bit fixed-point arithmetics
-void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx )
+void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx )
 {
     int stype = _ysrc.type();
     int depth = CV_MAT_DEPTH(stype);
@@ -503,13 +527,13 @@ void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArr
     {
         hal::cvtTwoPlaneYUVtoBGR(ysrc.data, uvsrc.data, ysrc.step,
                                  dst.data, dst.step, dst.cols, dst.rows,
-                                 dcn, swapb, uidx);
+                                 dcn, swapb, uidx, hint);
     }
     else
     {
         hal::cvtTwoPlaneYUVtoBGR(ysrc.data, ysrc.step, uvsrc.data, uvsrc.step,
                                 dst.data, dst.step, dst.cols, dst.rows,
-                                dcn, swapb, uidx);
+                                dcn, swapb, uidx, hint);
     }
 }
 
diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
index ceb6c8b0f6..1409dda991 100644
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -499,6 +499,23 @@ inline int hal_ni_cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar
  */
 inline int hal_ni_cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Analog of hal_cvtBGRtoYUV, but allows approximations (not bit-exact)
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
+   @param depth image depth (one of CV_8U, CV_16U or CV_32F)
+   @param scn source image channels (3 or 4)
+   @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
+   @param isCbCr if set to true write output in YCbCr format
+   Convert from BGR, RGB, BGRA or RGBA to YUV or YCbCr.
+ */
+inline int hal_ni_cvtBGRtoYUVApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+
 /**
    @brief hal_cvtYUVtoBGR
    @param src_data source image data
@@ -515,6 +532,22 @@ inline int hal_ni_cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * d
  */
 inline int hal_ni_cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Analog of hal_cvtYUVtoBGR, but allows approximations (not bit-exact)
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
+   @param depth image depth (one of CV_8U, CV_16U or CV_32F)
+   @param dcn destination image channels (3 or 4)
+   @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
+   @param isCbCr if set to true treat source as YCbCr
+   Convert from YUV or YCbCr to BGR, RGB, BGRA or RGBA.
+ */
+inline int hal_ni_cvtYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
 /**
    @brief hal_cvtBGRtoXYZ
    @param src_data source image data
@@ -630,6 +663,24 @@ inline int hal_ni_cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * d
  */
 inline int hal_ni_cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Analog of hal_cvtTwoPlaneYUVtoBGR that allows approximations (not bit-exact)
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param dcn destination image channels (3 or 4)
+   @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
+   @param uIdx U-channel index in the interleaved U/V plane (0 or 1)
+   Convert from YUV (YUV420sp (or NV12/NV21) - Y plane followed by interleaved U/V plane) to BGR, RGB, BGRA or RGBA.
+   Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+ */
+inline int hal_ni_cvtTwoPlaneYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+
 /**
    @brief Extended version of hal_cvtTwoPlaneYUVtoBGR.
    @param y_data source image data (Y-plane)
@@ -651,6 +702,27 @@ inline int hal_ni_cvtTwoPlaneYUVtoBGREx(const uchar * y_data, size_t y_step, con
                                       uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
                                       int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Extended version of hal_cvtTwoPlaneYUVtoBGR that allows approximations (not bit-exact)
+   @param y_data source image data (Y-plane)
+   @param y_step source image step (Y-plane)
+   @param uv_data source image data (UV-plane)
+   @param uv_step source image step (UV-plane)
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param dcn destination image channels (3 or 4)
+   @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
+   @param uIdx U-channel index in the interleaved U/V plane (0 or 1)
+   Convert from YUV (YUV420sp (or NV12/NV21) - Y plane followed by interleaved U/V plane) to BGR, RGB, BGRA or RGBA.
+   Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+ */
+inline int hal_ni_cvtTwoPlaneYUVtoBGRExApprox(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
+                                      uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                                      int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
 /**
    @brief hal_cvtBGRtoTwoPlaneYUV
    @param src_data source image data
@@ -690,6 +762,23 @@ inline int hal_ni_cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
  */
 inline int hal_ni_cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Analog of hal_cvtThreePlaneYUVtoBGR that allows approximations (not bit-exact)
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param dcn destination image channels (3 or 4)
+   @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
+   @param uIdx U-channel plane index (0 or 1)
+   Convert from YUV (YUV420p (or YV12/YV21) - Y plane followed by U and V planes) to BGR, RGB, BGRA or RGBA.
+   Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+ */
+inline int hal_ni_cvtThreePlaneYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
 /**
    @brief hal_cvtBGRtoThreePlaneYUV
    @param src_data source image data
@@ -707,6 +796,24 @@ inline int hal_ni_cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
  */
 inline int hal_ni_cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Analog of hal_cvtBGRtoThreePlaneYUV that allows approximations (not bit-exact)
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
+   @param scn source image channels (3 or 4)
+   @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
+   @param uIdx U-channel plane index (0 or 1)
+   Convert from BGR, RGB, BGRA or RGBA to YUV (YUV420p (or YV12/YV21) - Y plane followed by U and V planes).
+   Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+ */
+inline int hal_ni_cvtBGRtoThreePlaneYUVApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+
 /**
    @brief hal_cvtOnePlaneYUVtoBGR
    @param src_data source image data
@@ -725,6 +832,24 @@ inline int hal_ni_cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
  */
 inline int hal_ni_cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Analog of hal_cvtOnePlaneYUVtoBGR that allows approximations (not bit-exact)
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
+   @param dcn destination image channels (3 or 4)
+   @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
+   @param uIdx U-channel index (0 or 1)
+   @param ycn Y-channel index (0 or 1)
+   Convert from interleaved YUV 4:2:2 (UYVY, YUY2 or YVYU) to BGR, RGB, BGRA or RGBA.
+   Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+ */
+inline int hal_ni_cvtOnePlaneYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
 /**
    @brief hal_cvtOnePlaneBGRtoYUV
    @param src_data,src_step source image data and step
@@ -740,6 +865,21 @@ inline int hal_ni_cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, u
  */
 inline int hal_ni_cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+/**
+   @brief Analog of hal_cvtOnePlaneBGRtoYUV that allows approximations (not bit-exact)
+   @param src_data,src_step source image data and step
+   @param dst_data,dst_step destination image data and step
+   @param width,height image size
+   @param scn source image channels (3 or 4)
+   @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
+   @param uIdx U-channel index (0 or 1)
+   @param ycn Y-channel index (0 or 1)
+   Convert from BGR, RGB, BGRA or RGBA to interleaved YUV 4:2:2 (UYVY, YUY2 or YVYU).
+   Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+ */
+inline int hal_ni_cvtOnePlaneBGRtoYUVApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
 /**
    @brief hal_cvtRGBAtoMultipliedRGBA
    @param src_data source image data
@@ -775,7 +915,9 @@ inline int hal_ni_cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_ste
 #define cv_hal_cvtBGR5x5toGray hal_ni_cvtBGR5x5toGray
 #define cv_hal_cvtGraytoBGR5x5 hal_ni_cvtGraytoBGR5x5
 #define cv_hal_cvtBGRtoYUV hal_ni_cvtBGRtoYUV
+#define cv_hal_cvtBGRtoYUVApprox hal_ni_cvtBGRtoYUVApprox
 #define cv_hal_cvtYUVtoBGR hal_ni_cvtYUVtoBGR
+#define cv_hal_cvtYUVtoBGRApprox hal_ni_cvtYUVtoBGRApprox
 #define cv_hal_cvtBGRtoXYZ hal_ni_cvtBGRtoXYZ
 #define cv_hal_cvtXYZtoBGR hal_ni_cvtXYZtoBGR
 #define cv_hal_cvtBGRtoHSV hal_ni_cvtBGRtoHSV
@@ -783,12 +925,18 @@ inline int hal_ni_cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_ste
 #define cv_hal_cvtBGRtoLab hal_ni_cvtBGRtoLab
 #define cv_hal_cvtLabtoBGR hal_ni_cvtLabtoBGR
 #define cv_hal_cvtTwoPlaneYUVtoBGR hal_ni_cvtTwoPlaneYUVtoBGR
+#define cv_hal_cvtTwoPlaneYUVtoBGRApprox hal_ni_cvtTwoPlaneYUVtoBGRApprox
 #define cv_hal_cvtTwoPlaneYUVtoBGREx hal_ni_cvtTwoPlaneYUVtoBGREx
+#define cv_hal_cvtTwoPlaneYUVtoBGRExApprox hal_ni_cvtTwoPlaneYUVtoBGRExApprox
 #define cv_hal_cvtBGRtoTwoPlaneYUV hal_ni_cvtBGRtoTwoPlaneYUV
 #define cv_hal_cvtThreePlaneYUVtoBGR hal_ni_cvtThreePlaneYUVtoBGR
+#define cv_hal_cvtThreePlaneYUVtoBGRApprox hal_ni_cvtThreePlaneYUVtoBGRApprox
 #define cv_hal_cvtBGRtoThreePlaneYUV hal_ni_cvtBGRtoThreePlaneYUV
+#define cv_hal_cvtBGRtoThreePlaneYUVApprox hal_ni_cvtBGRtoThreePlaneYUVApprox
 #define cv_hal_cvtOnePlaneYUVtoBGR hal_ni_cvtOnePlaneYUVtoBGR
+#define cv_hal_cvtOnePlaneYUVtoBGRApprox hal_ni_cvtOnePlaneYUVtoBGRApprox
 #define cv_hal_cvtOnePlaneBGRtoYUV hal_ni_cvtOnePlaneBGRtoYUV
+#define cv_hal_cvtOnePlaneBGRtoYUVApprox hal_ni_cvtOnePlaneBGRtoYUVApprox
 #define cv_hal_cvtRGBAtoMultipliedRGBA hal_ni_cvtRGBAtoMultipliedRGBA
 #define cv_hal_cvtMultipliedRGBAtoRGBA hal_ni_cvtMultipliedRGBAtoRGBA
 //! @endcond
diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp
index 60862b2805..1229a468eb 100644
--- a/modules/imgproc/test/test_color.cpp
+++ b/modules/imgproc/test/test_color.cpp
@@ -2657,7 +2657,7 @@ TEST(Imgproc_ColorLab_Full, bitExactness)
             Mat probe(256, 256, CV_8UC3), result;
             rng.fill(probe, RNG::UNIFORM, 0, 255, true);
 
-            cvtColor(probe, result, codes[c]);
+            cvtColor(probe, result, codes[c], 0, ALGO_HINT_ACCURATE);
 
             uint32_t h = adler32(result);
             uint32_t goodHash = hashes[c*nIterations + iter];
@@ -2749,7 +2749,7 @@ TEST(Imgproc_ColorLuv_Full, bitExactness)
             Mat probe(256, 256, CV_8UC3), result;
             rng.fill(probe, RNG::UNIFORM, 0, 255, true);
 
-            cvtColor(probe, result, codes[c]);
+            cvtColor(probe, result, codes[c], 0, ALGO_HINT_ACCURATE);
 
             uint32_t h = adler32(result);
             uint32_t goodHash = hashes[c*nIterations + iter];
@@ -2808,7 +2808,7 @@ void runCvtColorBitExactCheck(ColorConversionCodes code, int inputType, uint32_t
     Mat dst;
     rng.fill(src, RNG::UNIFORM, 0, 255, true);
 
-    cv::cvtColor(src, dst, code);
+    cv::cvtColor(src, dst, code, 0, ALGO_HINT_ACCURATE);
 
     uint32_t dst_hash = adler32(dst);