Merge branch 4.x

pull/25998/head
Alexander Smorkalov 6 months ago
commit 7e8f2a1bc4
Changed files (75), number of changed lines in parentheses:
  1. 3rdparty/ndsrvp/include/core.hpp (2)
  2. 3rdparty/ndsrvp/include/imgproc.hpp (45)
  3. 3rdparty/ndsrvp/ndsrvp_hal.hpp (5)
  4. 3rdparty/ndsrvp/src/cvutils.cpp (78)
  5. 3rdparty/ndsrvp/src/cvutils.hpp (108)
  6. 3rdparty/ndsrvp/src/integral.cpp (2)
  7. 3rdparty/ndsrvp/src/remap.cpp (188)
  8. 3rdparty/ndsrvp/src/threshold.cpp (147)
  9. 3rdparty/ndsrvp/src/warpAffine.cpp (174)
 10. 3rdparty/ndsrvp/src/warpPerspective.cpp (208)
 11. CMakeLists.txt (2)
 12. cmake/OpenCVCompilerOptimizations.cmake (4)
 13. cmake/checks/cpu_sse2.cpp (16)
 14. modules/3d/include/opencv2/3d.hpp (17)
 15. modules/3d/src/fisheye.cpp (42)
 16. modules/3d/test/test_fisheye.cpp (90)
 17. modules/calib/src/calibinit.cpp (56)
 18. modules/core/include/opencv2/core/cuda.hpp (1)
 19. modules/core/src/cuda/gpu_mat.cu (7)
 20. modules/core/src/cuda_gpu_mat.cpp (5)
 21. modules/core/src/matrix.cpp (6)
 22. modules/core/src/persistence_xml.cpp (2)
 23. modules/core/test/test_allocator.cpp (145)
 24. modules/core/test/test_io.cpp (18)
 25. modules/core/test/test_utils.cpp (2)
 26. modules/dnn/src/int8layers/quantization_utils.cpp (201)
 27. modules/dnn/src/layers/elementwise_layers.cpp (2)
 28. modules/dnn/src/layers/nary_eltwise_layers.cpp (2)
 29. modules/dnn/src/onnx/onnx_importer.cpp (16)
 30. modules/dnn/test/test_onnx_conformance.cpp (2)
 31. modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp (12)
 32. modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp (6)
 33. modules/dnn/test/test_onnx_conformance_layer_filter_opencv_cpu_denylist.inl.hpp (1)
 34. modules/dnn/test/test_onnx_conformance_layer_filter_opencv_denylist.inl.hpp (4)
 35. modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp16_denylist.inl.hpp (8)
 36. modules/dnn/test/test_onnx_conformance_layer_filter_opencv_ocl_fp32_denylist.inl.hpp (6)
 37. modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp (4)
 38. modules/features2d/misc/java/test/ORBFeatureDetectorTest.java (39)
 39. modules/imgcodecs/src/bitstrm.cpp (54)
 40. modules/imgcodecs/src/bitstrm.hpp (20)
 41. modules/imgcodecs/src/grfmt_bmp.cpp (38)
 42. modules/imgcodecs/src/grfmt_pfm.cpp (34)
 43. modules/imgcodecs/src/grfmt_pxm.cpp (8)
 44. modules/imgcodecs/src/grfmt_sunras.cpp (18)
 45. modules/imgcodecs/src/grfmt_tiff.cpp (10)
 46. modules/imgcodecs/src/grfmt_tiff.hpp (4)
 47. modules/imgproc/include/opencv2/imgproc.hpp (6)
 48. modules/imgproc/include/opencv2/imgproc/hal/hal.hpp (8)
 49. modules/imgproc/include/opencv2/imgproc/hal/interface.h (6)
 50. modules/imgproc/src/color.cpp (23)
 51. modules/imgproc/src/color.hpp (18)
 52. modules/imgproc/src/color_yuv.dispatch.cpp (152)
 53. modules/imgproc/src/hal_replacement.hpp (198)
 54. modules/imgproc/src/imgwarp.cpp (307)
 55. modules/imgproc/test/test_color.cpp (6)
 56. modules/js/perf/perf_64bits.js (2)
 57. modules/js/perf/perf_helpfunc.js (2)
 58. modules/js/perf/perf_imgproc/perf_blur.js (2)
 59. modules/js/perf/perf_imgproc/perf_cvtcolor.js (2)
 60. modules/js/perf/perf_imgproc/perf_dilate.js (2)
 61. modules/js/perf/perf_imgproc/perf_erode.js (2)
 62. modules/js/perf/perf_imgproc/perf_filter2D.js (2)
 63. modules/js/perf/perf_imgproc/perf_gaussianBlur.js (2)
 64. modules/js/perf/perf_imgproc/perf_medianBlur.js (2)
 65. modules/js/perf/perf_imgproc/perf_pyrDown.js (2)
 66. modules/js/perf/perf_imgproc/perf_remap.js (2)
 67. modules/js/perf/perf_imgproc/perf_resize.js (2)
 68. modules/js/perf/perf_imgproc/perf_scharr.js (2)
 69. modules/js/perf/perf_imgproc/perf_sobel.js (2)
 70. modules/js/perf/perf_imgproc/perf_threshold.js (2)
 71. modules/js/perf/perf_imgproc/perf_warpAffine.js (2)
 72. modules/js/perf/perf_imgproc/perf_warpPerspective.js (2)
 73. modules/python/src2/gen2.py (15)
 74. modules/python/test/test_misc.py (2)
 75. modules/videoio/cmake/detect_obsensor.cmake (2)

@ -1,6 +1,6 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_CORE_HPP
#define OPENCV_NDSRVP_CORE_HPP

@ -1,18 +1,12 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_IMGPROC_HPP
#define OPENCV_NDSRVP_IMGPROC_HPP
namespace cv {
// ################ remap ################
void remap(InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue);
namespace ndsrvp {
enum InterpolationMasks {
@ -36,23 +30,36 @@ int integral(int depth, int sdepth, int sqdepth,
// ################ warpAffine ################
int warpAffine(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[6], int interpolation, int borderType, const double borderValue[4]);
int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
#undef cv_hal_warpAffine
#define cv_hal_warpAffine (cv::ndsrvp::warpAffine)
#undef cv_hal_warpAffineBlocklineNN
#define cv_hal_warpAffineBlocklineNN (cv::ndsrvp::warpAffineBlocklineNN)
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
#undef cv_hal_warpAffineBlockline
#define cv_hal_warpAffineBlockline (cv::ndsrvp::warpAffineBlockline)
// ################ warpPerspective ################
int warpPerspective(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[9], int interpolation, int borderType, const double borderValue[4]);
int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);
#undef cv_hal_warpPerspectiveBlocklineNN
#define cv_hal_warpPerspectiveBlocklineNN (cv::ndsrvp::warpPerspectiveBlocklineNN)
int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
#undef cv_hal_warpPerspectiveBlockline
#define cv_hal_warpPerspectiveBlockline (cv::ndsrvp::warpPerspectiveBlockline)
// ################ remap ################
int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4]);
#undef cv_hal_warpPerspective
#define cv_hal_warpPerspective (cv::ndsrvp::warpPerspective)
#undef cv_hal_remap32f
#define cv_hal_remap32f (cv::ndsrvp::remap32f)
// ################ threshold ################

@ -1,13 +1,14 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_HAL_HPP
#define OPENCV_NDSRVP_HAL_HPP
#include "opencv2/core/mat.hpp"
#include <nds_intrinsic.h>
#include "opencv2/core/hal/interface.h"
#include "include/core.hpp"
#include "include/imgproc.hpp"
#include "include/features2d.hpp"

@ -0,0 +1,78 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
// fastMalloc
// [0][1][2][3][4][5][6][7][8][9]
// ^udata
// ^adata
// ^adata[-1] == udata
void* fastMalloc(size_t size)
{
uchar* udata = (uchar*)malloc(size + sizeof(void*) + CV_MALLOC_ALIGN);
if(!udata)
ndsrvp_error(Error::StsNoMem, "fastMalloc(): Not enough memory");
uchar** adata = (uchar**)align((size_t)((uchar**)udata + 1), CV_MALLOC_ALIGN);
adata[-1] = udata;
return adata;
}
void fastFree(void* ptr)
{
if(ptr)
{
uchar* udata = ((uchar**)ptr)[-1];
if(!(udata < (uchar*)ptr && ((uchar*)ptr - udata) <= (ptrdiff_t)(sizeof(void*) + CV_MALLOC_ALIGN)))
ndsrvp_error(Error::StsBadArg, "fastFree(): Invalid memory block");
free(udata);
}
}
// borderInterpolate
int borderInterpolate(int p, int len, int borderType)
{
if( (unsigned)p < (unsigned)len )
;
else if( borderType == CV_HAL_BORDER_REPLICATE )
p = p < 0 ? 0 : len - 1;
else if( borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101 )
{
int delta = borderType == CV_HAL_BORDER_REFLECT_101;
if( len == 1 )
return 0;
do
{
if( p < 0 )
p = -p - 1 + delta;
else
p = len - 1 - (p - len) - delta;
}
while( (unsigned)p >= (unsigned)len );
}
else if( borderType == CV_HAL_BORDER_WRAP )
{
ndsrvp_assert(len > 0);
if( p < 0 )
p -= ((p - len + 1) / len) * len;
if( p >= len )
p %= len;
}
else if( borderType == CV_HAL_BORDER_CONSTANT )
p = -1;
else
ndsrvp_error(Error::StsBadArg, "borderInterpolate(): Unknown/unsupported border type");
return p;
}
} // namespace ndsrvp
} // namespace cv
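For reference, a minimal sketch of the values this helper is expected to produce for out-of-range indices, assuming an Andes NDSRVP build where cvutils.hpp and the CV_HAL_BORDER_* constants from opencv2/core/hal/interface.h are available (the results mirror cv::borderInterpolate for the same modes):
// illustrative only
#include <cassert>
#include "cvutils.hpp"
void borderInterpolateSketch()
{
    using cv::ndsrvp::borderInterpolate;
    assert(borderInterpolate(-2, 5, CV_HAL_BORDER_REPLICATE)   == 0);  // clamp to the first pixel
    assert(borderInterpolate( 6, 5, CV_HAL_BORDER_REPLICATE)   == 4);  // clamp to the last pixel
    assert(borderInterpolate(-2, 5, CV_HAL_BORDER_REFLECT_101) == 2);  // mirror without repeating the edge
    assert(borderInterpolate(-2, 5, CV_HAL_BORDER_WRAP)        == 3);  // periodic extension
    assert(borderInterpolate(-2, 5, CV_HAL_BORDER_CONSTANT)    == -1); // caller substitutes the border value
}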

@ -0,0 +1,108 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_NDSRVP_CVUTILS_HPP
#define OPENCV_NDSRVP_CVUTILS_HPP
#include <nds_intrinsic.h>
#include "opencv2/core/hal/interface.h"
#include <cstring>
#include <cmath>
#include <iostream>
#include <string>
#include <array>
#include <climits>
#include <algorithm>
// misc functions that are not exposed to the public interface
namespace cv {
namespace ndsrvp {
void* fastMalloc(size_t size);
void fastFree(void* ptr);
int borderInterpolate(int p, int len, int borderType);
#ifndef MAX
# define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif
#define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT)
#define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
#define CV_MALLOC_ALIGN 64
// error codes
enum Error{
StsNoMem = -4,
StsBadArg = -5,
StsAssert = -215
};
// output error
#define ndsrvp_assert(expr) { if(!(expr)) ndsrvp_error(Error::StsAssert, std::string(#expr)); }
inline void ndsrvp_error(int code, std::string msg = "")
{
std::cerr << "NDSRVP Error: code " << code << std::endl;
if(!msg.empty())
std::cerr << msg << std::endl;
if(code < 0)
throw code;
}
// clip & vclip
inline int clip(int x, int a, int b)
{
return x >= a ? (x < b ? x : b - 1) : a;
}
inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
{
return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a));
}
// saturate
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> static inline _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> static inline _Tp saturate_cast(double v) { return _Tp(v); }
template<> inline uchar saturate_cast<uchar>(int v) { return __nds__uclip32(v, 8); }
template<> inline uchar saturate_cast<uchar>(float v) { return saturate_cast<uchar>((int)lrintf(v)); }
template<> inline uchar saturate_cast<uchar>(double v) { return saturate_cast<uchar>((int)lrint(v)); }
template<> inline char saturate_cast<char>(int v) { return __nds__sclip32(v, 7); }
template<> inline char saturate_cast<char>(float v) { return saturate_cast<char>((int)lrintf(v)); }
template<> inline char saturate_cast<char>(double v) { return saturate_cast<char>((int)lrint(v)); }
template<> inline ushort saturate_cast<ushort>(int v) { return __nds__uclip32(v, 16); }
template<> inline ushort saturate_cast<ushort>(float v) { return saturate_cast<ushort>((int)lrintf(v)); }
template<> inline ushort saturate_cast<ushort>(double v) { return saturate_cast<ushort>((int)lrint(v)); }
template<> inline short saturate_cast<short>(int v) { return __nds__sclip32(v, 15); }
template<> inline short saturate_cast<short>(float v) { return saturate_cast<short>((int)lrintf(v)); }
template<> inline short saturate_cast<short>(double v) { return saturate_cast<short>((int)lrint(v)); }
template<> inline int saturate_cast<int>(float v) { return (int)lrintf(v); }
template<> inline int saturate_cast<int>(double v) { return (int)lrint(v); }
// align
inline long align(size_t v, int n)
{
return (v + n - 1) & -n;
}
} // namespace ndsrvp
} // namespace cv
#endif
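As a quick sanity check of the arithmetic helpers above (a sketch only; align() rounds a byte count up to the next multiple of n, and clip() clamps an index into the half-open range [a, b) the way the warp/remap kernels expect):
// illustrative only; standalone mirrors of the inline helpers in this header
#include <cassert>
#include <cstddef>
static long align_up(size_t v, int n) { return (v + n - 1) & -n; }
static int clip_idx(int x, int a, int b) { return x >= a ? (x < b ? x : b - 1) : a; }
void cvutilsSketch()
{
    assert(align_up(13, 8)  == 16);  // round 13 up to the next multiple of 8
    assert(align_up(64, 64) == 64);  // already aligned values are unchanged
    assert(clip_idx(-3, 0, 5) == 0); // below the range clamps to a
    assert(clip_idx( 7, 0, 5) == 4); // at or above b clamps to b - 1
}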

@ -3,6 +3,8 @@
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {

@ -0,0 +1,188 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
int remap32f(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height, float* mapx, size_t mapx_step,
float* mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4])
{
const bool isRelative = ((interpolation & CV_HAL_WARP_RELATIVE_MAP) != 0);
interpolation &= ~CV_HAL_WARP_RELATIVE_MAP;
if( interpolation == CV_HAL_INTER_AREA )
interpolation = CV_HAL_INTER_LINEAR;
if( interpolation != CV_HAL_INTER_NEAREST )
return CV_HAL_ERROR_NOT_IMPLEMENTED;
// only CV_8U
if( (src_type & CV_MAT_DEPTH_MASK) != CV_8U )
return CV_HAL_ERROR_NOT_IMPLEMENTED;
int cn = CV_MAT_CN(src_type);
src_step /= sizeof(uchar);
dst_step /= sizeof(uchar);
// mapping CV_32FC1
mapx_step /= sizeof(float);
mapy_step /= sizeof(float);
// border
uchar border_const[CV_CN_MAX];
for( int k = 0; k < CV_CN_MAX; k++ )
border_const[k] = saturate_cast<uchar>(border_value[k & 3]);
// divide into blocks
const int BLOCK_SIZE = 1024;
int x, y, x1, y1;
std::array<short, BLOCK_SIZE * BLOCK_SIZE * 2> aXY;
short* XY = aXY.data();
size_t XY_step = BLOCK_SIZE * 2;
// vectorize
const int32x2_t src_wh = {src_width, src_height};
const int32x2_t arr_index = {cn, (int)src_step};
for (y = 0; y < dst_height; y += BLOCK_SIZE)
{
int dy = std::min(BLOCK_SIZE, dst_height - y);
for (x = 0; x < dst_width; x += BLOCK_SIZE)
{
const int off_y = isRelative ? y : 0;
const int off_x = isRelative ? x : 0;
const int32x2_t voff = {off_x, off_y};
int dx = std::min(BLOCK_SIZE, dst_width - x);
// prepare mapping data XY
for (y1 = 0; y1 < dy; y1++)
{
short* rXY = XY + y1 * XY_step;
const float* sX = mapx + (y + y1) * mapx_step + x;
const float* sY = mapy + (y + y1) * mapy_step + x;
for (x1 = 0; x1 < dx; x1++)
{
rXY[x1 * 2] = saturate_cast<short>(sX[x1]);
rXY[x1 * 2 + 1] = saturate_cast<short>(sY[x1]);
}
}
// precalculate offset: for a relative map, add each pixel's (x1, y1) position within the block here;
// the block origin (x, y) is added later through voff
if(isRelative)
{
int16x8_t voff_x;
int16x8_t voff_y = {0, 0, 1, 0, 2, 0, 3, 0};
int16x8_t vones_x = {4, 0, 4, 0, 4, 0, 4, 0};
int16x8_t vones_y = {0, 1, 0, 1, 0, 1, 0, 1};
for(y1 = 0; y1 < BLOCK_SIZE; y1++, voff_y += vones_y)
{
int16x8_t* vrXY = (int16x8_t*)(XY + y1 * XY_step);
for(x1 = 0, voff_x = voff_y; x1 < BLOCK_SIZE; x1 += 4, vrXY++, voff_x += vones_x)
{
*vrXY += voff_x;
}
}
}
// process the block
for( y1 = 0; y1 < dy; y1++ )
{
uchar* dst_row = dst_data + (y + y1) * dst_step + x * cn;
const short* rXY = XY + y1 * XY_step;
if( cn == 1 )
{
for( x1 = 0; x1 < dx; x1++ )
{
int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 )
dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
else
{
if( border_type == CV_HAL_BORDER_REPLICATE )
{
vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
}
else if( border_type == CV_HAL_BORDER_CONSTANT )
dst_row[x1] = border_const[0];
else if( border_type != CV_HAL_BORDER_TRANSPARENT )
{
vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
dst_row[x1] = src_data[__nds__v_smar64(0, vsxy, arr_index)];
}
}
}
}
else
{
uchar* dst_ptr = dst_row;
for(x1 = 0; x1 < dx; x1++, dst_ptr += cn )
{
int32x2_t vsxy = (int32x2_t){rXY[x1 * 2], rXY[x1 * 2 + 1]} + voff;
const uchar *src_ptr;
if( (long)((uint32x2_t)vsxy < (uint32x2_t)src_wh) == -1 )
{
if( cn == 3 )
{
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[2];
// performance loss, commented out
// *(unsigned*)dst_ptr = __nds__bpick(*(unsigned*)dst_ptr, *(unsigned*)src_ptr, 0xFF000000);
}
else if( cn == 4 )
{
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
*(uint8x4_t*)dst_ptr = *(uint8x4_t*)src_ptr;
}
else
{
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
int k = cn;
for(; k >= 8; k -= 8, dst_ptr += 8, src_ptr += 8)
*(uint8x8_t*)dst_ptr = *(uint8x8_t*)src_ptr;
while( k-- )
dst_ptr[k] = src_ptr[k];
}
}
else if( border_type != CV_HAL_BORDER_TRANSPARENT )
{
if( border_type == CV_HAL_BORDER_REPLICATE )
{
vsxy = vclip(vsxy, (int32x2_t){0, 0}, src_wh);
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
}
else if( border_type == CV_HAL_BORDER_CONSTANT )
src_ptr = &border_const[0];
else
{
vsxy[0] = borderInterpolate(vsxy[0], src_width, border_type);
vsxy[1] = borderInterpolate(vsxy[1], src_height, border_type);
src_ptr = (uchar*)__nds__v_smar64((long)src_data, vsxy, arr_index);
}
int k = cn;
for(; k >= 8; k -= 8, dst_ptr += 8, src_ptr += 8)
*(uint8x8_t*)dst_ptr = *(uint8x8_t*)src_ptr;
while( k-- )
dst_ptr[k] = src_ptr[k];
}
}
}
}
}
}
return CV_HAL_ERROR_OK;
}
} // namespace ndsrvp
} // namespace cv

@ -4,65 +4,44 @@
#include "ndsrvp_hal.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
template <typename type, typename vtype>
class operators_threshold_t {
public:
virtual ~operators_threshold_t() {};
virtual inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)src;
(void)thresh;
(void)maxval;
CV_Error(cv::Error::StsBadArg, "");
return vtype();
}
virtual inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)src;
(void)thresh;
(void)maxval;
CV_Error(cv::Error::StsBadArg, "");
return type();
}
};
template <typename type, typename vtype>
class opThreshBinary : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshBinary_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
return (vtype)__nds__bpick((long)maxval, (long)0, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
return src > thresh ? maxval : 0;
}
};
template <typename type, typename vtype>
class opThreshBinaryInv : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshBinaryInv_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
return (vtype)__nds__bpick((long)0, (long)maxval, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
return src > thresh ? 0 : maxval;
}
};
template <typename type, typename vtype>
class opThreshTrunc : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshTrunc_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)maxval;
return (vtype)__nds__bpick((long)thresh, (long)src, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)maxval;
return src > thresh ? thresh : src;
@ -70,13 +49,13 @@ class opThreshTrunc : public operators_threshold_t<type, vtype> {
};
template <typename type, typename vtype>
class opThreshToZero : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshToZero_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)maxval;
return (vtype)__nds__bpick((long)src, (long)0, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)maxval;
return src > thresh ? src : 0;
@ -84,29 +63,36 @@ class opThreshToZero : public operators_threshold_t<type, vtype> {
};
template <typename type, typename vtype>
class opThreshToZeroInv : public operators_threshold_t<type, vtype> {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
struct opThreshToZeroInv_t {
inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
{
(void)maxval;
return (vtype)__nds__bpick((long)0, (long)src, (long)(src > thresh));
}
inline type scalar(const type& src, const type& thresh, const type& maxval) override
inline type scalar(const type& src, const type& thresh, const type& maxval)
{
(void)maxval;
return src > thresh ? 0 : src;
}
};
template <typename type, typename vtype, int nlane>
static void threshold_op(const type* src_data, size_t src_step,
type* dst_data, size_t dst_step,
template <typename type, typename vtype, int nlane,
template <typename ttype, typename vttype> typename opThresh_t>
static inline void threshold_op(const uchar* src, size_t src_step,
uchar* dst, size_t dst_step,
int width, int height, int cn,
type thresh, type maxval, int thtype)
double thresh_d, double maxval_d)
{
int i, j;
width *= cn;
type* src_data = (type*)src;
type* dst_data = (type*)dst;
src_step /= sizeof(type);
dst_step /= sizeof(type);
type thresh = saturate_cast<type>(thresh_d);
type maxval = saturate_cast<type>(maxval_d);
vtype vthresh;
vtype vmaxval;
for (i = 0; i < nlane; i++) {
@ -114,62 +100,63 @@ static void threshold_op(const type* src_data, size_t src_step,
vmaxval[i] = maxval;
}
operators_threshold_t<type, vtype>* op;
switch (thtype) {
case CV_HAL_THRESH_BINARY:
op = new opThreshBinary<type, vtype>();
break;
case CV_HAL_THRESH_BINARY_INV:
op = new opThreshBinaryInv<type, vtype>();
break;
case CV_HAL_THRESH_TRUNC:
op = new opThreshTrunc<type, vtype>();
break;
case CV_HAL_THRESH_TOZERO:
op = new opThreshToZero<type, vtype>();
break;
case CV_HAL_THRESH_TOZERO_INV:
op = new opThreshToZeroInv<type, vtype>();
break;
default:
CV_Error(cv::Error::StsBadArg, "");
return;
}
opThresh_t<type, vtype> opThresh;
for (i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) {
for (j = 0; j <= width - nlane; j += nlane) {
vtype vs = *(vtype*)(src_data + j);
*(vtype*)(dst_data + j) = op->vector(vs, vthresh, vmaxval);
*(vtype*)(dst_data + j) = opThresh.vector(*(vtype*)(src_data + j), vthresh, vmaxval);
}
for (; j < width; j++) {
dst_data[j] = op->scalar(src_data[j], thresh, maxval);
dst_data[j] = opThresh.scalar(src_data[j], thresh, maxval);
}
}
delete op;
return;
}
typedef void (*ThreshFunc)(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step,
int width, int height, int cn,
double thresh, double maxval);
int threshold(const uchar* src_data, size_t src_step,
uchar* dst_data, size_t dst_step,
int width, int height, int depth, int cn,
double thresh, double maxValue, int thresholdType)
{
if (width <= 255 && height <= 255) // slower at small size
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if (depth == CV_8U) {
threshold_op<uchar, uint8x8_t, 8>((uchar*)src_data, src_step, (uchar*)dst_data, dst_step, width, height, cn, (uchar)thresh, (uchar)maxValue, thresholdType);
return CV_HAL_ERROR_OK;
} else if (depth == CV_16S) {
threshold_op<short, int16x4_t, 4>((short*)src_data, src_step, (short*)dst_data, dst_step, width, height, cn, (short)thresh, (short)maxValue, thresholdType);
return CV_HAL_ERROR_OK;
} else if (depth == CV_16U) {
threshold_op<ushort, uint16x4_t, 4>((ushort*)src_data, src_step, (ushort*)dst_data, dst_step, width, height, cn, (ushort)thresh, (ushort)maxValue, thresholdType);
return CV_HAL_ERROR_OK;
} else {
static ThreshFunc thfuncs[4][5] =
{
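// rows: depth index (CV_8U, CV_8S, CV_16U, CV_16S); columns: threshold type
// (BINARY, BINARY_INV, TRUNC, TOZERO, TOZERO_INV), matching the [depth][thresholdType] lookup below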
{
threshold_op<uchar, uint8x8_t, 8, opThreshBinary_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshBinaryInv_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshTrunc_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshToZero_t>,
threshold_op<uchar, uint8x8_t, 8, opThreshToZeroInv_t> },
{
threshold_op<char, int8x8_t, 8, opThreshBinary_t>,
threshold_op<char, int8x8_t, 8, opThreshBinaryInv_t>,
threshold_op<char, int8x8_t, 8, opThreshTrunc_t>,
threshold_op<char, int8x8_t, 8, opThreshToZero_t>,
threshold_op<char, int8x8_t, 8, opThreshToZeroInv_t> },
{
threshold_op<ushort, uint16x4_t, 4, opThreshBinary_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshBinaryInv_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshTrunc_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshToZero_t>,
threshold_op<ushort, uint16x4_t, 4, opThreshToZeroInv_t> },
{
threshold_op<short, int16x4_t, 4, opThreshBinary_t>,
threshold_op<short, int16x4_t, 4, opThreshBinaryInv_t>,
threshold_op<short, int16x4_t, 4, opThreshTrunc_t>,
threshold_op<short, int16x4_t, 4, opThreshToZero_t>,
threshold_op<short, int16x4_t, 4, opThreshToZeroInv_t> }
};
if(depth < 0 || depth > 3 || thresholdType < 0 || thresholdType > 4 || (width < 256 && height < 256))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
thfuncs[depth][thresholdType](src_data, src_step, dst_data, dst_step, width, height, cn, thresh, maxValue);
return CV_HAL_ERROR_OK;
}
} // namespace ndsrvp

@ -3,148 +3,68 @@
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/core.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
class WarpAffineInvoker : public ParallelLoopBody {
public:
WarpAffineInvoker(const Mat& _src, Mat& _dst, int _interpolation, int _borderType,
const Scalar& _borderValue, int* _adelta, int* _bdelta, const double* _M)
: ParallelLoopBody()
, src(_src)
, dst(_dst)
, interpolation(_interpolation)
, borderType(_borderType)
, borderValue(_borderValue)
, adelta(_adelta)
, bdelta(_bdelta)
, M(_M)
{
int warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
}
virtual void operator()(const Range& range) const CV_OVERRIDE
{
const int BLOCK_SZ = 64;
AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
short *XY = __XY.data(), *A = __A.data();
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == CV_HAL_INTER_NEAREST ? AB_SCALE / 2 : AB_SCALE / INTER_TAB_SIZE / 2, x, y, x1, y1;
int bh0 = std::min(BLOCK_SZ / 2, dst.rows);
int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, dst.cols);
bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, dst.rows);
for (y = range.start; y < range.end; y += bh0) {
for (x = 0; x < dst.cols; x += bw0) {
int bw = std::min(bw0, dst.cols - x);
int bh = std::min(bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY);
Mat dpart(dst, Rect(x, y, bw, bh));
for (y1 = 0; y1 < bh; y1++) {
short* xy = XY + y1 * bw * 2;
int X0 = saturate_cast<int>((M[1] * (y + y1) + M[2]) * AB_SCALE) + round_delta;
int Y0 = saturate_cast<int>((M[4] * (y + y1) + M[5]) * AB_SCALE) + round_delta;
if (interpolation == CV_HAL_INTER_NEAREST) {
x1 = 0;
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
}
for (; x1 < bw; x1++) {
int X = (X0 + adelta[x + x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x + x1]) >> AB_BITS;
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
} else {
short* alpha = A + y1 * bw;
x1 = 0;
const int INTER_MASK = INTER_TAB_SIZE - 1;
const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
*(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
}
for (; x1 < bw; x1++) {
int X = (X0 + adelta[x + x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x + x1]) >> (AB_BITS - INTER_BITS);
xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
}
}
}
if (interpolation == CV_HAL_INTER_NEAREST)
remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
else {
Mat _matA(bh, bw, CV_16U, A);
remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
}
}
}
for (; x1 < bw; x1++) {
int X = X0 + adelta[x1];
int Y = Y0 + bdelta[x1];
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
private:
Mat src;
Mat dst;
int interpolation, borderType;
Scalar borderValue;
int *adelta, *bdelta;
const double* M;
};
int warpAffine(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[6], int interpolation, int borderType, const double borderValue[4])
{
Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
return CV_HAL_ERROR_OK;
}
int x;
AutoBuffer<int> _abdelta(dst.cols * 2);
int *adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int x1 = 0;
const int INTER_MASK = INTER_TAB_SIZE - 1;
const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
for (; x1 < bw; x1 += 2) {
int32x2_t vX = { X0 + adelta[x1], X0 + adelta[x1 + 1] };
int32x2_t vY = { Y0 + bdelta[x1], Y0 + bdelta[x1 + 1] };
vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
*(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
}
for (x = 0; x < dst.cols; x++) {
adelta[x] = saturate_cast<int>(M[0] * x * AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3] * x * AB_SCALE);
for (; x1 < bw; x1++) {
int X = X0 + adelta[x1];
int Y = Y0 + bdelta[x1];
xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK));
}
Range range(0, dst.rows);
WarpAffineInvoker invoker(src, dst, interpolation, borderType,
Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
adelta, bdelta, M);
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
return CV_HAL_ERROR_OK;
}
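// Note on the fixed-point scheme shared by both blockline helpers: coordinates are accumulated with
// AB_BITS (= max(10, INTER_BITS)) fractional bits. The NN variant shifts right by AB_BITS to obtain
// integer pixel indices; the bilinear variant shifts by AB_BITS - INTER_BITS, keeps the upper bits as
// the integer coordinate, and packs the low INTER_BITS bits of X and Y into alpha as an index into the
// INTER_TAB_SIZE x INTER_TAB_SIZE interpolation table (alpha = (Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK)).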

@ -3,154 +3,90 @@
// of this distribution and at http://opencv.org/license.html.
#include "ndsrvp_hal.hpp"
#include "opencv2/core.hpp"
#include "opencv2/imgproc/hal/interface.h"
#include "cvutils.hpp"
namespace cv {
namespace ndsrvp {
class WarpPerspectiveInvoker : public ParallelLoopBody {
public:
WarpPerspectiveInvoker(const Mat& _src, Mat& _dst, const double* _M, int _interpolation,
int _borderType, const Scalar& _borderValue)
: ParallelLoopBody()
, src(_src)
, dst(_dst)
, M(_M)
, interpolation(_interpolation)
, borderType(_borderType)
, borderValue(_borderValue)
{
int warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
{
int x1 = 0;
for (; x1 < bw; x1 += 2) {
double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
W1 = W1 ? 1. / W1 : 0;
W2 = W2 ? 1. / W2 : 0;
double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
vX = __nds__v_sclip32(vX, 15);
vY = __nds__v_sclip32(vY, 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
}
virtual void operator()(const Range& range) const CV_OVERRIDE
{
const int BLOCK_SZ = 32;
short XY[BLOCK_SZ * BLOCK_SZ * 2], A[BLOCK_SZ * BLOCK_SZ];
int x, y, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ / 2, height);
int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, width);
bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, height);
for (y = range.start; y < range.end; y += bh0) {
for (x = 0; x < width; x += bw0) {
int bw = std::min(bw0, width - x);
int bh = std::min(bh0, range.end - y); // height
Mat _XY(bh, bw, CV_16SC2, XY);
Mat dpart(dst, Rect(x, y, bw, bh));
for (y1 = 0; y1 < bh; y1++) {
short* xy = XY + y1 * bw * 2;
double X0 = M[0] * x + M[1] * (y + y1) + M[2];
double Y0 = M[3] * x + M[4] * (y + y1) + M[5];
double W0 = M[6] * x + M[7] * (y + y1) + M[8];
if (interpolation == CV_HAL_INTER_NEAREST) {
int x1 = 0;
for (; x1 < bw; x1 += 2) {
double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
W1 = W1 ? 1. / W1 : 0;
W2 = W2 ? 1. / W2 : 0;
double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
vX = __nds__v_sclip32(vX, 15);
vY = __nds__v_sclip32(vY, 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
}
for (; x1 < bw; x1++) {
double W = W0 + M[6] * x1;
W = W ? 1. / W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
} else {
short* alpha = A + y1 * bw;
int x1 = 0;
const int INTER_MASK = INTER_TAB_SIZE - 1;
const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
for (; x1 < bw; x1 += 2) {
double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
*(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
}
for (; x1 < bw; x1++) {
double W = W0 + M[6] * x1;
W = W ? INTER_TAB_SIZE / W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
}
}
}
if (interpolation == CV_HAL_INTER_NEAREST)
remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
else {
Mat _matA(bh, bw, CV_16U, A);
remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
}
}
}
for (; x1 < bw; x1++) {
double W = W0 + M[6] * x1;
W = W ? 1. / W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
private:
Mat src;
Mat dst;
const double* M;
int interpolation, borderType;
Scalar borderValue;
};
int warpPerspective(int src_type,
const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[9], int interpolation, int borderType, const double borderValue[4])
return CV_HAL_ERROR_OK;
}
int warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
{
Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
int x1 = 0;
const int INTER_MASK = INTER_TAB_SIZE - 1;
const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
for (; x1 < bw; x1 += 2) {
double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
*(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
*(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
}
for (; x1 < bw; x1++) {
double W = W0 + M[6] * x1;
W = W ? INTER_TAB_SIZE / W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & INTER_MASK) * INTER_TAB_SIZE + (X & INTER_MASK));
}
Range range(0, dst.rows);
WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
return CV_HAL_ERROR_OK;
}

@ -1026,7 +1026,7 @@ foreach(hal ${OpenCV_HAL})
ocv_hal_register(NDSRVP_HAL_LIBRARIES NDSRVP_HAL_HEADERS NDSRVP_HAL_INCLUDE_DIRS)
list(APPEND OpenCV_USED_HAL "ndsrvp (ver ${NDSRVP_HAL_VERSION})")
else()
message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not open, disabling ndsrvp...")
message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...")
endif()
elseif(hal STREQUAL "halrvv")
if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")

@ -171,7 +171,7 @@ elseif(" ${CMAKE_CXX_FLAGS} " MATCHES " -march=native | -xHost | /QxHost ")
endif()
if(X86 OR X86_64)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;FP16;FMA3;AVX;AVX2;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
ocv_update(CPU_KNOWN_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;POPCNT;SSE4_2;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
ocv_update(CPU_AVX512_COMMON_GROUP "AVX_512F;AVX_512CD")
ocv_update(CPU_AVX512_KNL_GROUP "AVX512_COMMON;AVX512_KNL_EXTRA")
@ -440,7 +440,7 @@ macro(ocv_check_compiler_optimization OPT)
set(_varname "")
if(CPU_${OPT}_TEST_FILE)
set(__available 0)
if(CPU_BASELINE_DETECT)
if(__is_from_baseline OR CPU_BASELINE_DETECT)
set(_varname "HAVE_CPU_${OPT}_SUPPORT")
ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" "${_varname}" "${CPU_${OPT}_TEST_FILE}")
if(${_varname})

@ -1,2 +1,16 @@
#include <emmintrin.h>
int main() { return 0; }
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
const __m128i delta = _mm_set1_epi32((int)0x80000000);
return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
}
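// XOR-ing both operands with 0x80000000 flips the sign bit and maps unsigned order onto signed order,
// so the signed comparison above implements an unsigned 32-bit greater-than (SSE2 has no _mm_cmpgt_epu32).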
int main()
{
__m128i a, b, c;
a = _mm_set1_epi32(0x00000000);
b = _mm_set1_epi32(0x0000ffff);
c = _v128_comgt_epu32(a, b);
return 0;
}

@ -2512,10 +2512,25 @@ the number of points in the view.
@param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
Note that the function assumes the camera intrinsic matrix of the undistorted points to be identity.
This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$.
This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$ or
use another function overload.
*/
CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray K, InputArray D, double alpha = 0);
/** @overload
Overload of distortPoints function to handle cases when the undistorted points are obtained with a non-identity
camera matrix, e.g. the output of #estimateNewCameraMatrixForUndistortRectify.
@param undistorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is
the number of points in the view.
@param Kundistorted Camera intrinsic matrix used as new camera matrix for undistortion.
@param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
@param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
@param alpha The skew coefficient.
@param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
@sa estimateNewCameraMatrixForUndistortRectify
*/
CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray Kundistorted, InputArray K, InputArray D, double alpha = 0);
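An illustrative call sequence for this overload, sketched with placeholder calibration values (K, D and imageSize would come from a real fisheye calibration; assumes the usual OpenCV headers) and mirroring the distortUndistortPointsNewCameraFixed test added in test_fisheye.cpp:
// sketch only; placeholder intrinsics and distortion
cv::Matx33d K(600, 0, 640,  0, 600, 400,  0, 0, 1);
cv::Vec4d D(-0.01, 0.005, 0.0, 0.0);
cv::Size imageSize(1280, 800);
cv::Mat Knew, Reye = cv::Mat::eye(3, 3, CV_64FC1);
cv::fisheye::estimateNewCameraMatrixForUndistortRectify(K, D, imageSize, Reye, Knew);
std::vector<cv::Point2d> undistorted(1, cv::Point2d(640, 400)); // point expressed in the Knew image plane
std::vector<cv::Point2d> distorted;
cv::fisheye::distortPoints(undistorted, distorted, Knew, K, D); // Kundistorted = Knew, then the calibrated K, D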
/** @brief Undistorts 2D points using fisheye model
@param distorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is the

@ -266,6 +266,48 @@ void cv::fisheye::distortPoints(InputArray undistorted, OutputArray distorted, I
}
}
void cv::fisheye::distortPoints(InputArray _undistorted, OutputArray distorted, InputArray Kundistorted, InputArray K, InputArray D, double alpha)
{
CV_INSTRUMENT_REGION();
CV_Assert(_undistorted.type() == CV_32FC2 || _undistorted.type() == CV_64FC2);
CV_Assert(Kundistorted.size() == Size(3,3) && (Kundistorted.type() == CV_32F || Kundistorted.type() == CV_64F));
cv::Mat undistorted = _undistorted.getMat();
cv::Mat normalized(undistorted.size(), CV_64FC2);
Mat Knew = Kundistorted.getMat();
double cx, cy, fx, fy;
if (Knew.depth() == CV_32F)
{
fx = (double)Knew.at<float>(0, 0);
fy = (double)Knew.at<float>(1, 1);
cx = (double)Knew.at<float>(0, 2);
cy = (double)Knew.at<float>(1, 2);
}
else
{
fx = Knew.at<double>(0, 0);
fy = Knew.at<double>(1, 1);
cx = Knew.at<double>(0, 2);
cy = Knew.at<double>(1, 2);
}
size_t n = undistorted.total();
const Vec2f* Xf = undistorted.ptr<Vec2f>();
const Vec2d* Xd = undistorted.ptr<Vec2d>();
Vec2d* normXd = normalized.ptr<Vec2d>();
for (size_t i = 0; i < n; i++)
{
Vec2d p = undistorted.depth() == CV_32F ? (Vec2d)Xf[i] : Xd[i];
normXd[i][0] = (p[0] - cx) / fx;
normXd[i][1] = (p[1] - cy) / fy;
}
cv::fisheye::distortPoints(normalized, distorted, K, D, alpha);
}
void cv::fisheye::undistortPoints( InputArray distorted, OutputArray undistorted, InputArray K, InputArray D,
InputArray R, InputArray P, TermCriteria criteria)
{

@ -86,7 +86,6 @@ TEST_F(fisheyeTest, distortUndistortPoints)
int height = imageSize.height;
/* Create test points */
std::vector<cv::Point2d> points0Vector;
cv::Mat principalPoints = (cv::Mat_<double>(5, 2) << K(0, 2), K(1, 2), // (cx, cy)
/* Image corners */
0, 0,
@ -129,6 +128,95 @@ TEST_F(fisheyeTest, distortUndistortPoints)
}
}
TEST_F(fisheyeTest, distortUndistortPointsNewCameraFixed)
{
int width = imageSize.width;
int height = imageSize.height;
/* Random points inside image */
cv::Mat xy[2] = {};
xy[0].create(100, 1, CV_64F);
theRNG().fill(xy[0], cv::RNG::UNIFORM, 0, width); // x
xy[1].create(100, 1, CV_64F);
theRNG().fill(xy[1], cv::RNG::UNIFORM, 0, height); // y
cv::Mat randomPoints;
merge(xy, 2, randomPoints);
cv::Mat points0 = randomPoints;
cv::Mat Reye = cv::Mat::eye(3, 3, CV_64FC1);
cv::Mat Knew;
cv::fisheye::estimateNewCameraMatrixForUndistortRectify(K, D, imageSize, Reye, Knew);
/* Distort -> Undistort */
cv::Mat distortedPoints;
cv::fisheye::distortPoints(points0, distortedPoints, Knew, K, D);
cv::Mat undistortedPoints;
cv::fisheye::undistortPoints(distortedPoints, undistortedPoints, K, D, Reye, Knew);
EXPECT_MAT_NEAR(points0, undistortedPoints, 1e-8);
/* Undistort -> Distort */
cv::fisheye::undistortPoints(points0, undistortedPoints, K, D, Reye, Knew);
cv::fisheye::distortPoints(undistortedPoints, distortedPoints, Knew, K, D);
EXPECT_MAT_NEAR(points0, distortedPoints, 1e-8);
}
TEST_F(fisheyeTest, distortUndistortPointsNewCameraRandom)
{
int width = imageSize.width;
int height = imageSize.height;
/* Create test points */
std::vector<cv::Point2d> points0Vector;
cv::Mat principalPoints = (cv::Mat_<double>(5, 2) << K(0, 2), K(1, 2), // (cx, cy)
/* Image corners */
0, 0,
0, height,
width, 0,
width, height
);
/* Random points inside image */
cv::Mat xy[2] = {};
xy[0].create(100, 1, CV_64F);
theRNG().fill(xy[0], cv::RNG::UNIFORM, 0, width); // x
xy[1].create(100, 1, CV_64F);
theRNG().fill(xy[1], cv::RNG::UNIFORM, 0, height); // y
cv::Mat randomPoints;
merge(xy, 2, randomPoints);
cv::Mat points0;
cv::Mat Reye = cv::Mat::eye(3, 3, CV_64FC1);
cv::vconcat(principalPoints.reshape(2), randomPoints, points0);
/* Test with random D set */
for (size_t i = 0; i < 10; ++i) {
cv::Mat distortion(1, 4, CV_64F);
theRNG().fill(distortion, cv::RNG::UNIFORM, -0.001, 0.001);
cv::Mat Knew;
cv::fisheye::estimateNewCameraMatrixForUndistortRectify(K, distortion, imageSize, Reye, Knew);
/* Distort -> Undistort */
cv::Mat distortedPoints;
cv::fisheye::distortPoints(points0, distortedPoints, Knew, K, distortion);
cv::Mat undistortedPoints;
cv::fisheye::undistortPoints(distortedPoints, undistortedPoints, K, distortion, Reye, Knew);
EXPECT_MAT_NEAR(points0, undistortedPoints, 1e-8);
/* Undistort -> Distort */
cv::fisheye::undistortPoints(points0, undistortedPoints, K, distortion, Reye, Knew);
cv::fisheye::distortPoints(undistortedPoints, distortedPoints, Knew, K, distortion);
EXPECT_MAT_NEAR(points0, distortedPoints, 1e-8);
}
}
TEST_F(fisheyeTest, solvePnP)
{
const int n = 16;

@ -221,7 +221,7 @@ public:
int all_quads_count;
struct NeighborsFinder {
const float thresh_scale = 1.f;
const float thresh_scale = sqrt(2.f);
ChessBoardDetector& detector;
std::vector<int> neighbors_indices;
std::vector<float> neighbors_dists;
@ -231,8 +231,9 @@ public:
NeighborsFinder(ChessBoardDetector& detector);
bool findCornerNeighbor(
const int idx,
const cv::Point2f& pt,
const int quad_idx,
const int corner_idx,
const cv::Point2f& corner_pt,
float& min_dist,
const float radius,
int& closest_quad_idx,
@ -513,9 +514,23 @@ ChessBoardDetector::NeighborsFinder::NeighborsFinder(ChessBoardDetector& _detect
neighbors_dists.resize(all_corners_count);
}
static double pointSideFromLine(const Point2f& line_direction_vector, const Point2f& vector)
{
return line_direction_vector.cross(vector);
}
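// The sign of this 2D cross product tells on which side of the directed line the point lies;
// two points lie on the same side exactly when their signed values have the same sign.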
static bool arePointsOnSameSideFromLine(const Point2f& line_pt1, const Point2f& line_pt2, const Point2f& pt1, const Point2f& pt2)
{
const Point2f line_direction_vector = line_pt2 - line_pt1;
const Point2f vector1 = pt1 - line_pt1;
const Point2f vector2 = pt2 - line_pt1;
return pointSideFromLine(line_direction_vector, vector1) * pointSideFromLine(line_direction_vector, vector2) > 0.;
}
bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
const int idx,
const cv::Point2f& pt,
const int quad_idx,
const int corner_idx,
const cv::Point2f& corner_pt,
float& min_dist,
const float radius,
int& closest_quad_idx,
@ -524,12 +539,12 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
{
ChessBoardQuad* p_all_quads = detector.all_quads.data();
const ChessBoardQuad& cur_quad = (const ChessBoardQuad&)p_all_quads[idx];
const ChessBoardQuad& cur_quad = (const ChessBoardQuad&)p_all_quads[quad_idx];
int closest_neighbor_idx = -1;
ChessBoardQuad *closest_quad = 0;
// find the closest corner in all other quadrangles
const std::vector<float> query = { pt.x, pt.y };
const std::vector<float> query = { corner_pt.x, corner_pt.y };
const cvflann::SearchParams search_params(-1);
const int neighbors_count = all_quads_pts_index.radiusSearch(query, neighbors_indices, neighbors_dists, radius, search_params);
@ -537,7 +552,7 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
{
const int neighbor_idx = neighbors_indices[neighbor_idx_idx];
const int k = neighbor_idx >> 2;
if (k == idx)
if (k == quad_idx)
continue;
ChessBoardQuad& q_k = p_all_quads[k];
@ -545,7 +560,8 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
if (q_k.neighbors[j])
continue;
const float dist = normL2Sqr<float>(pt - all_quads_pts[neighbor_idx]);
const Point2f neighbor_pt = all_quads_pts[neighbor_idx];
const float dist = normL2Sqr<float>(corner_pt - neighbor_pt);
if (dist <= cur_quad.edge_len * thresh_scale &&
dist <= q_k.edge_len * thresh_scale)
{
@ -559,6 +575,24 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
DPRINTF("Incompatible edge lengths");
continue;
}
const Point2f mid_pt1 = (cur_quad.corners[corner_idx]->pt + cur_quad.corners[(corner_idx + 1) & 3]->pt) / 2.f;
const Point2f mid_pt2 = (cur_quad.corners[(corner_idx + 2) & 3]->pt + cur_quad.corners[(corner_idx + 3) & 3]->pt) / 2.f;
if (!arePointsOnSameSideFromLine(mid_pt1, mid_pt2, corner_pt, neighbor_pt))
continue;
const Point2f mid_pt3 = (cur_quad.corners[(corner_idx + 1) & 3]->pt + cur_quad.corners[(corner_idx + 2) & 3]->pt) / 2.f;
const Point2f mid_pt4 = (cur_quad.corners[(corner_idx + 3) & 3]->pt + cur_quad.corners[corner_idx]->pt) / 2.f;
if (!arePointsOnSameSideFromLine(mid_pt3, mid_pt4, corner_pt, neighbor_pt))
continue;
const Point2f neighbor_pt_diagonal = q_k.corners[(j + 2) & 3]->pt;
if (!arePointsOnSameSideFromLine(mid_pt1, mid_pt2, corner_pt, neighbor_pt_diagonal))
continue;
if (!arePointsOnSameSideFromLine(mid_pt3, mid_pt4, neighbor_pt, neighbor_pt_diagonal))
continue;
closest_neighbor_idx = neighbor_idx;
closest_quad_idx = k;
closest_corner_idx = j;
@ -588,7 +622,7 @@ bool ChessBoardDetector::NeighborsFinder::findCornerNeighbor(
if (cur_quad.neighbors[j] == closest_quad)
break;
if (normL2Sqr<float>(closest_corner_pt - all_quads_pts[(idx << 2) + j]) < min_dist)
if (normL2Sqr<float>(closest_corner_pt - all_quads_pts[(quad_idx << 2) + j]) < min_dist)
break;
}
if (j < 4)
@ -1792,6 +1826,7 @@ void ChessBoardDetector::findQuadNeighbors()
bool found = neighborsFinder.findCornerNeighbor(
idx,
i,
pt,
min_dist,
radius,
@ -1812,6 +1847,7 @@ void ChessBoardDetector::findQuadNeighbors()
found = neighborsFinder.findCornerNeighbor(
closest_quad_idx,
closest_corner_idx,
closest_corner_pt,
min_dist,
radius,

@ -118,6 +118,7 @@ public:
//! default allocator
CV_WRAP static GpuMat::Allocator* defaultAllocator();
CV_WRAP static void setDefaultAllocator(GpuMat::Allocator* allocator);
CV_WRAP static GpuMat::Allocator* getStdAllocator();
//! default constructor
CV_WRAP explicit GpuMat(GpuMat::Allocator* allocator = GpuMat::defaultAllocator());

@ -135,6 +135,7 @@ namespace
DefaultAllocator cudaDefaultAllocator;
GpuMat::Allocator* g_defaultAllocator = &cudaDefaultAllocator;
GpuMat::Allocator* g_stdAllocator = &cudaDefaultAllocator;
}
GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()
@ -148,6 +149,12 @@ void cv::cuda::GpuMat::setDefaultAllocator(Allocator* allocator)
g_defaultAllocator = allocator;
}
GpuMat::Allocator* cv::cuda::GpuMat::getStdAllocator()
{
return g_stdAllocator;
}
/////////////////////////////////////////////////////
/// create

@ -420,6 +420,11 @@ void cv::cuda::GpuMat::setDefaultAllocator(Allocator* allocator)
throw_no_cuda();
}
GpuMat::Allocator* cv::cuda::GpuMat::getStdAllocator()
{
return 0;
}
void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
{
CV_UNUSED(_rows);

@ -718,16 +718,13 @@ void Mat::create(int d0, const int* _sizes, int _type)
if( total() > 0 )
{
MatAllocator *a = allocator, *a0 = getDefaultAllocator();
#ifdef HAVE_TGPU
if( !a || a == tegra::getAllocator() )
a = tegra::getAllocator(d, _sizes, _type);
#endif
if(!a)
a = a0;
try
{
u = a->allocate(dims, size, _type, 0, step.p, ACCESS_RW /* ignored */, USAGE_DEFAULT);
CV_Assert(u != 0);
allocator = a;
}
catch (...)
{
@ -735,6 +732,7 @@ void Mat::create(int d0, const int* _sizes, int _type)
throw;
u = a0->allocate(dims, size, _type, 0, step.p, ACCESS_RW /* ignored */, USAGE_DEFAULT);
CV_Assert(u != 0);
allocator = a0;
}
CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) );
}

@ -737,6 +737,8 @@ public:
if( c != '\"' && c != '\'' )
{
ptr = skipSpaces( ptr, CV_XML_INSIDE_TAG );
if(!ptr)
CV_PARSE_ERROR_CPP("Invalid attribute value");
if( *ptr != '\"' && *ptr != '\'' )
CV_PARSE_ERROR_CPP( "Attribute value should be put into single or double quotes" );
}

@ -0,0 +1,145 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "test_precomp.hpp"
#include "opencv2/core/core_c.h" // needed for CV_AUTOSTEP
namespace opencv_test { namespace {
// Dummy allocator implementation copied from the default OpenCV allocator with some simplifications
struct DummyAllocator: public cv::MatAllocator
{
public:
DummyAllocator() {};
~DummyAllocator() {};
cv::UMatData* allocate(int dims, const int* sizes, int type,
void* data0, size_t* step, cv::AccessFlag flags,
cv::UMatUsageFlags usageFlags) const
{
CV_UNUSED(flags);
CV_UNUSED(usageFlags);
size_t total = CV_ELEM_SIZE(type);
for( int i = dims-1; i >= 0; i-- )
{
if( step )
{
if( data0 && step[i] != CV_AUTOSTEP )
{
CV_Assert(total <= step[i]);
total = step[i];
}
else
step[i] = total;
}
total *= sizes[i];
}
uchar* data = nullptr;
if (data0)
{
data = (uchar*)data0;
}
else
{
data = new uchar[total];
DummyAllocator::allocatedBytes += total;
DummyAllocator::allocations++;
}
cv::UMatData* u = new cv::UMatData(this);
u->data = u->origdata = data;
u->size = total;
if(data0)
u->flags |= cv::UMatData::USER_ALLOCATED;
return u;
}
bool allocate(cv::UMatData* u, cv::AccessFlag accessFlags, cv::UMatUsageFlags usageFlags) const
{
CV_UNUSED(accessFlags);
CV_UNUSED(usageFlags);
if(!u) return false;
return true;
}
void deallocate(cv::UMatData* u) const
{
if(!u)
return;
CV_Assert(u->urefcount == 0);
CV_Assert(u->refcount == 0);
if( !(u->flags & cv::UMatData::USER_ALLOCATED) )
{
delete[] u->origdata;
DummyAllocator::deallocations++;
u->origdata = 0;
}
delete u;
}
static size_t allocatedBytes;
static int allocations;
static int deallocations;
};
size_t DummyAllocator::allocatedBytes = 0;
int DummyAllocator::allocations = 0;
int DummyAllocator::deallocations = 0;
cv::MatAllocator* getDummyAllocator()
{
static cv::MatAllocator* allocator = new DummyAllocator;
return allocator;
}
struct AllocatorTest : public testing::Test {
void SetUp() override {
cv::MatAllocator* allocator = getDummyAllocator();
EXPECT_TRUE(allocator != nullptr);
cv::Mat::setDefaultAllocator(allocator);
}
void TearDown() override {
cv::Mat::setDefaultAllocator(cv::Mat::getStdAllocator());
}
};
TEST_F(AllocatorTest, DummyAllocator)
{
cv::MatAllocator* dummy = getDummyAllocator();
DummyAllocator::allocatedBytes = 0;
DummyAllocator::allocations = 0;
DummyAllocator::deallocations = 0;
{
cv::Mat src1 = cv::Mat::ones (16, 16, CV_8UC1);
EXPECT_TRUE(!src1.empty());
EXPECT_EQ(src1.allocator, dummy);
cv::Mat src1_roi = src1(cv::Rect(2,2,8,8));
EXPECT_EQ(src1_roi.allocator, dummy);
cv::MatAllocator* standard = cv::Mat::getStdAllocator();
cv::Mat::setDefaultAllocator(standard);
cv::Mat src2 = cv::Mat::ones (16, 16, CV_8UC1);
EXPECT_TRUE(!src2.empty());
EXPECT_EQ(src2.allocator, standard);
src1.create(32, 32, CV_8UC1);
EXPECT_EQ(src1.allocator, dummy);
}
size_t expect_allocated = 16*16*sizeof(uchar) + 32*32*sizeof(uchar);
EXPECT_EQ(expect_allocated, DummyAllocator::allocatedBytes);
// ROI should not trigger extra allocations
EXPECT_EQ(2, DummyAllocator::allocations);
EXPECT_EQ(2, DummyAllocator::deallocations);
}
}} // namespace
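For reference, a minimal sketch (not part of this patch) of how a custom cv::MatAllocator such as the DummyAllocator above can be installed and restored in application code; the wrapper function name is illustrative:

#include <opencv2/core.hpp>

// Sketch only: route Mat allocations through a user-supplied allocator,
// then restore the previously active one.
void useCustomAllocator(cv::MatAllocator* custom)
{
    cv::MatAllocator* previous = cv::Mat::getDefaultAllocator();
    cv::Mat::setDefaultAllocator(custom);    // new Mat buffers now come from 'custom'
    {
        cv::Mat m(64, 64, CV_8UC1);          // allocated through 'custom'
    }                                        // deallocated through 'custom' as well
    cv::Mat::setDefaultAllocator(previous);  // restore the original allocator
}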

@ -2007,4 +2007,22 @@ INSTANTIATE_TEST_CASE_P( /*nothing*/,
Core_InputOutput_regression_25073,
Values("test.json", "test.xml", "test.yml") );
// see https://github.com/opencv/opencv/issues/25946
TEST(Core_InputOutput, FileStorage_invalid_attribute_value_regression_25946)
{
const std::string fileName = cv::tempfile("FileStorage_invalid_attribute_value_exception_test.xml");
const std::string content = "<?xml \n_=";
std::fstream testFile;
testFile.open(fileName.c_str(), std::fstream::out);
if(!testFile.is_open()) FAIL();
testFile << content;
testFile.close();
FileStorage fs;
EXPECT_ANY_THROW( fs.open(fileName, FileStorage::READ + FileStorage::FORMAT_XML) );
ASSERT_EQ(0, std::remove(fileName.c_str()));
}
}} // namespace

@ -345,7 +345,7 @@ TEST(Samples, findFile)
{
cv::utils::logging::LogLevel prev = cv::utils::logging::setLogLevel(cv::utils::logging::LOG_LEVEL_VERBOSE);
cv::String path;
ASSERT_NO_THROW(path = samples::findFile("lena.jpg", false));
ASSERT_NO_THROW(path = samples::findFile("HappyFish.jpg", false));
EXPECT_NE(std::string(), path.c_str());
cv::utils::logging::setLogLevel(prev);
}

@ -15,7 +15,10 @@ namespace dnn
static void broadcast1D2TargetMat(Mat& data, const MatShape& targetShape, int axis)
{
// The data is the 1-D scales or zeropoints.
CV_Assert(axis >= 0 && targetShape.size() > axis && data.total() == targetShape[axis]);
CV_CheckGE(axis, 0, "Quantization axis must be non-negative.");
CV_CheckGT((int)targetShape.size(), axis, "Quantization axis must be within the valid range of target shape dimensions.");
CV_CheckEQ((int)data.total(), (int)targetShape[axis], "Data total size must match the size of the specified target dimension.");
std::vector<int> broadcast_axes;
for (int i = 0; i < targetShape.size(); i++)
{
@ -35,29 +38,98 @@ static void broadcast1D2TargetMat(Mat& data, const MatShape& targetShape, int ax
}
}
static void broadcastScaleAndZeropoint(Mat& scalesMat, Mat& zeropointsMat, const std::vector<float>& scales,
const std::vector<int>& zeropoints, const MatShape& targetShape, int axis)
static void block_repeat(InputArray src, const MatShape& srcShape, int axis, int repetitions, OutputArray dst)
{
// broad cast the scales and zeropoint to the input shape.
MatShape subTargetShape(targetShape.size(), 1);
subTargetShape[axis] = scales.size();
CV_Assert(src.getObj() != dst.getObj());
CV_Check(axis, axis >= 0 && axis < src.dims(), "Axis out of range");
CV_CheckGT(repetitions, 1, "More than one repetition expected");
zeropointsMat.create(subTargetShape.size(), subTargetShape.data(), CV_32FC1);
scalesMat.create(subTargetShape.size(), subTargetShape.data(), CV_32FC1);
Mat src_mat = src.getMat();
Mat dst_mat;
const int len = scales.size();
// Deep copy the scales and zeropoint data and prevent the original data from being changed.
if (src_mat.depth() != CV_32F)
src_mat.convertTo(src_mat, CV_32F);
float * scalePtr = scalesMat.ptr<float>(0);
for (int i = 0; i < len; i++)
scalePtr[i] = scales[i];
MatShape sshape = srcShape;
MatShape dshape = srcShape;
size_t dtype_bytes = src_mat.elemSize();
int chunk_size = dtype_bytes;
int num_chunks = 1;
dshape[axis] *= repetitions;
for (int i = axis+1; i < sshape.size(); ++i)
chunk_size*=sshape[i];
for (int i = 0; i <= axis; ++i)
num_chunks*=sshape[i];
dst.create(dshape.size(), dshape.data(), src_mat.type());
dst_mat = dst.getMat();
CV_Assert(dst_mat.isContinuous());
CV_Assert(src_mat.isContinuous());
for (int i = 0; i < repetitions; ++i) {
size_t src_offset = 0;
size_t dst_offset = i * chunk_size;
for (int j = 0; j < num_chunks; ++j) {
memcpy(dst_mat.data + dst_offset, src_mat.data + src_offset, chunk_size);
src_offset += chunk_size;
dst_offset += chunk_size * repetitions;
}
}
}
template <typename T>
static void copyVecToMat(Mat& mat, const std::vector<T>& data){
float * matPtr = mat.ptr<float>(0);
const int len = data.size();
float * zpPtr = zeropointsMat.ptr<float>(0);
for (int i = 0; i < len; i++)
zpPtr[i] = (float )zeropoints[i];
matPtr[i] = (float) data[i];
}
broadcast1D2TargetMat(scalesMat, targetShape, axis);
broadcast1D2TargetMat(zeropointsMat, targetShape, axis);
template <typename T>
static void broadcastBlockedMatrix(Mat& mat, const std::vector<T>& data, const MatShape& targetShape, int axis, int block_size){
CV_Check(block_size, targetShape[axis] % block_size == 0 && block_size <= targetShape[axis], "Block size must be a divisor of the target dimension size and not exceed it.");
MatShape subTargetShape(targetShape);
subTargetShape[axis] = static_cast<int>(subTargetShape[axis] / block_size);
block_repeat(data, subTargetShape, axis, block_size, mat);
}
template <typename T>
static void broadcastStandardMatrix(Mat& mat, const std::vector<T>& data, const MatShape& targetShape, int axis)
{
MatShape subTargetShape(targetShape.size(), 1);
subTargetShape[axis] = data.size();
mat.create(subTargetShape.size(), subTargetShape.data(), CV_32FC1);
copyVecToMat(mat,data);
broadcast1D2TargetMat(mat, targetShape, axis);
}
static void broadcastScaleAndZeropoint(Mat& scalesMat, Mat& zeropointsMat, const std::vector<float>& scales,
const std::vector<int>& zeropoints, const MatShape& targetShape, int axis, int block_size)
{
// broadcast the scales and zeropoints to the input shape.
if (block_size == 0)
{
broadcastStandardMatrix(zeropointsMat, zeropoints, targetShape, axis);
broadcastStandardMatrix(scalesMat, scales, targetShape, axis);
}
else
{
broadcastBlockedMatrix(zeropointsMat, zeropoints, targetShape, axis, block_size);
broadcastBlockedMatrix(scalesMat, scales, targetShape, axis, block_size);
}
}
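To make the blocked path above concrete, here is a small stand-alone sketch (illustrative only, using std::vector instead of cv::Mat) of the expansion that block_repeat performs for a 1-D scale vector: each per-block scale is repeated block_size times along the quantization axis.

#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> scales = {0.5f, 2.0f};  // one scale per block
    const int block_size = 2;                  // elements covered by each scale
    std::vector<float> expanded;               // per-element scales after expansion
    for (float s : scales)
        for (int i = 0; i < block_size; ++i)
            expanded.push_back(s);
    for (float s : expanded)
        std::printf("%g ", s);                 // prints: 0.5 0.5 2 2
    return 0;
}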
// Quantize FP32/FP16 Inputs to INT8
@ -65,13 +137,17 @@ class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
{
public:
int axis;
int block_size;
bool is1D;
Mat scalesMat, zeropointsMat; // Saving the broadcasetd scales data.
Mat scalesMat, zeropointsMat; // Saving the broadcasted scales data.
bool quantParamExternal = true; // Indicates if the quantization parameters (scale and zero point) are provided as inputs to the node.
QuantizeLayerImpl(const LayerParams& params)
{
is1D = params.get<bool>("is1D", false);
axis = params.get<int>("axis", 1);
block_size = params.get<int>("block_size", 0);
if (!is1D)
{
scales.push_back(params.get<float>("scales", 1.0f));
@ -82,7 +158,7 @@ public:
DictValue paramScales = params.get("scales");
int i, n = paramScales.size();
CV_Assert(n > 0);
CV_CheckGT(n, 0, "Scale missing.");
scales.resize(n, 0.);
for (i = 0; i < n; i++)
scales[i] = paramScales.get<float>(i);
@ -108,7 +184,7 @@ public:
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return false;
}
@ -134,7 +210,7 @@ public:
if (is1D)
{
MatShape inputShape = shape(inputs[0]);
broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis);
broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size);
}
}
@ -156,6 +232,39 @@ public:
return true;
}
#endif
void processInputOutput(std::vector<Mat>& inputs, std::vector<Mat>& outputs)
{
CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
quantParamExternal &= inputs.size() > 1;
// Scale and zeropoint taken as input
if (quantParamExternal)
{
quantParamExternal = false;
scalesMat = inputs[1];
scalesMat.reshape(1, 1).copyTo(scales);
if(scalesMat.total() > 1) is1D = true;
if (inputs.size() > 2)
{
zeropointsMat = inputs[2];
CV_CheckEQ((int)zeropointsMat.total(), (int)scalesMat.total(), "Scale and zero point elements number must match.");
zeropointsMat.reshape(1, 1).copyTo(zeropoints);
}
if (is1D)
{
MatShape inputShape = shape(inputs[0]);
broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size);
}
}
if (outputs[0].depth() != CV_8S)
outputs[0].convertTo(outputs[0], CV_8S);
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
@ -169,14 +278,13 @@ public:
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
if (outputs[0].depth() != CV_8S)
outputs[0].convertTo(outputs[0], CV_8S);
processInputOutput(inputs, outputs);
if (is1D)
{
Mat inputTmp;
divide(inputs[0], scalesMat, inputTmp);
subtract(inputTmp, zeropointsMat, inputTmp);
add(inputTmp, zeropointsMat, inputTmp);
inputTmp.convertTo(outputs[0], CV_8S);
}
@ -200,13 +308,16 @@ class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
{
public:
int axis;
int block_size;
bool is1D;
Mat scalesMat, zeropointsMat; // Saving the broadcasted scales data.
bool quantParamExternal = true;
DequantizeLayerImpl(const LayerParams& params)
{
is1D = params.get<bool>("is1D", false);
axis = params.get<int>("axis", 1);
block_size = params.get<int>("block_size", 0);
if (!is1D)
{
@ -218,7 +329,7 @@ public:
DictValue paramScales = params.get("scales");
int i, n = paramScales.size();
CV_Assert(n > 0);
CV_CheckGT(n, 0, "Scale missing.");
scales.resize(n);
for (i = 0; i < n; i++)
scales[i] = paramScales.get<float>(i);
@ -244,7 +355,7 @@ public:
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return false;
}
@ -273,7 +384,7 @@ public:
if (is1D)
{
MatShape inputShape = shape(inputs[0]);
broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis);
broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size);
}
}
@ -292,6 +403,39 @@ public:
}
#endif
void processInputOutput(std::vector<Mat>& inputs, std::vector<Mat>& outputs)
{
CV_Check(inputs.size(), inputs.size() >= 1 && inputs.size() <= 3, "Number of inputs must be between 1 and 3 inclusive.");
quantParamExternal &= inputs.size() > 1;
// Scale and zeropoint taken as input
if (quantParamExternal)
{
quantParamExternal = false;
scalesMat = inputs[1];
scalesMat.reshape(1, 1).copyTo(scales);
if(scalesMat.total() > 1) is1D = true;
if (inputs.size() > 2)
{
zeropointsMat = inputs[2];
CV_CheckEQ((int)zeropointsMat.total(), (int)scalesMat.total(), "Scale and zero point elements number must match.");
zeropointsMat.reshape(1, 1).copyTo(zeropoints);
}
if (is1D)
{
MatShape inputShape = shape(inputs[0]);
broadcastScaleAndZeropoint(scalesMat, zeropointsMat, scales, zeropoints, inputShape, axis, block_size);
}
}
if (outputs[0].depth() != CV_32F)
outputs[0].convertTo(outputs[0], CV_32F);
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
@ -304,8 +448,7 @@ public:
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
if (outputs[0].depth() != CV_32F)
outputs[0].convertTo(outputs[0], CV_32F);
processInputOutput(inputs, outputs);
if (is1D)
{

@ -803,7 +803,7 @@ struct GeluFunctor : public BaseFunctor {
#endif
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
{
return std::make_shared<ov::op::v0::Gelu>(node);
}

@ -1057,7 +1057,7 @@ public:
// In case only one input
if (inputs.size() == 1) {
auto &ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
ngraph::OutputVector inp{ieInpNode};
ov::OutputVector inp{ieInpNode};
auto blank = std::make_shared<ov::op::v0::Concat>(inp, 0);
return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
}

@ -3280,6 +3280,17 @@ void ONNXImporter::parseQuantDequant(LayerParams& layerParams, const opencv_onnx
// or 1-D tensor (per-channel quantized).
bool is1D = false;
if (layerParams.type == "Quantize")
layerParams.set("depth", CV_8S);
else // Dequantize
layerParams.set("depth", CV_32F);
// If scale is not defined as a constant blob, it is considered an external input.
if(constBlobs.find(node_proto.input(1)) == constBlobs.end()){
addLayer(layerParams, node_proto);
return;
}
Mat scaleMat = getBlob(node_proto, 1);
if(scaleMat.total() > 1) is1D = true;
@ -3321,11 +3332,6 @@ void ONNXImporter::parseQuantDequant(LayerParams& layerParams, const opencv_onnx
layerParams.set("zeropoints", zeropoint);
}
if (layerParams.type == "Quantize")
layerParams.set("depth", CV_8S);
else // Dequantize
layerParams.set("depth", CV_32F);
if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) // Variable input.
{
std::vector<Mat> inputs, outputs;

@ -224,6 +224,7 @@ static const TestCase testConformanceConfig[] = {
{"test_depthtospace_example", 1, 1},
{"test_dequantizelinear", 3, 1},
{"test_dequantizelinear_axis", 3, 1},
{"test_dequantizelinear_blocked", 3, 1},
{"test_det_2d", 1, 1},
{"test_det_nd", 1, 1},
{"test_div", 2, 1},
@ -569,6 +570,7 @@ static const TestCase testConformanceConfig[] = {
{"test_qlinearmatmul_3D", 8, 1},
{"test_quantizelinear", 3, 1},
{"test_quantizelinear_axis", 3, 1},
{"test_quantizelinear_blocked", 3, 1},
{"test_range_float_type_positive_delta", 3, 1},
{"test_range_float_type_positive_delta_expanded", 3, 1},
{"test_range_int32_type_negative_delta", 3, 1},

@ -501,9 +501,11 @@ CASE(test_depthtospace_dcr_mode)
CASE(test_depthtospace_example)
// no filter
CASE(test_dequantizelinear)
// no filter
SKIP;
CASE(test_dequantizelinear_axis)
// no filter
SKIP;
CASE(test_dequantizelinear_blocked)
SKIP;
CASE(test_det_2d)
// no filter
CASE(test_det_nd)
@ -1280,9 +1282,11 @@ CASE(test_qlinearmatmul_2D)
CASE(test_qlinearmatmul_3D)
// no filter
CASE(test_quantizelinear)
// no filter
SKIP;
CASE(test_quantizelinear_axis)
// no filter
SKIP;
CASE(test_quantizelinear_blocked)
SKIP;
CASE(test_range_float_type_positive_delta)
// no filter
CASE(test_range_float_type_positive_delta_expanded)

@ -48,6 +48,9 @@
"test_cumsum_2d_axis_1",
"test_cumsum_2d_negative_axis",
"test_concat_1d_axis_negative_1",
"test_dequantizelinear",
"test_dequantizelinear_axis",
"test_dequantizelinear_blocked",
"test_div_uint8",
"test_flatten_axis0",
"test_flatten_axis2",
@ -71,6 +74,9 @@
"test_pow_types_float32_int32", // vulkan backend does not take tensor other than float32 data type
"test_pow_types_float32_int64", // vulkan backend does not take tensor other than float32 data type
"test_pow_types_int", // vulkan backend does not take tensor other than float32 data type
"test_quantizelinear",
"test_quantizelinear_axis",
"test_quantizelinear_blocked",
"test_softmax_default_axis",
"test_sub_bcast",
"test_sub_uint8",

@ -0,0 +1,4 @@
"test_dequantizelinear_blocked", // Issue https://github.com/opencv/opencv/issues/25999
"test_quantizelinear", // Issue https://github.com/opencv/opencv/issues/25999
"test_quantizelinear_axis", // Issue https://github.com/opencv/opencv/issues/25999
"test_quantizelinear_blocked", // Issue https://github.com/opencv/opencv/issues/25999

@ -1,4 +1,7 @@
"test_averagepool_3d_default",
"test_dequantizelinear",
"test_dequantizelinear_axis",
"test_dequantizelinear_blocked",
"test_dropout_default_ratio",
"test_globalmaxpool",
"test_globalmaxpool_precomputed",
@ -14,7 +17,10 @@
"test_maxpool_2d_same_upper",
"test_maxpool_2d_strides",
"test_maxpool_3d_default",
"test_pow", // fp16 accuracy issue
"test_pow",
"test_quantizelinear",
"test_quantizelinear_axis",
"test_quantizelinear_blocked",
"test_softmax_large_number",
"test_softmax_large_number_expanded",
"test_split_equal_parts_1d",

@ -1,5 +1,11 @@
"test_averagepool_3d_default",
"test_dequantizelinear",
"test_dequantizelinear_axis",
"test_dequantizelinear_blocked",
"test_maxpool_3d_default",
"test_quantizelinear",
"test_quantizelinear_axis",
"test_quantizelinear_blocked",
"test_scatter_elements_with_axis",
"test_scatter_elements_with_duplicate_indices",
"test_scatter_elements_with_negative_indices",

@ -80,8 +80,6 @@
"test_convtranspose_pad", // Issue::Parser::Weights are required as inputs
"test_convtranspose_pads", // Issue::Parser::Weights are required as inputs
"test_convtranspose_with_kernel", // Issue::Parser::Weights are required as inputs
"test_dequantizelinear", // Issue::Parser::Weights are required as inputs
"test_dequantizelinear_axis", // Issue::Parser::Weights are required as inputs
"test_det_2d", // Issue:: Unkonwn error
"test_det_nd", // Issue:: Unkonwn error
"test_dropout_default_mask", // Issue::cvtest::norm::wrong data type
@ -235,8 +233,6 @@
"test_qlinearconv", // Issue::Parser: Blob x_scale not found in const blobs in function 'getBlob' (weights are required as inputs)
"test_qlinearmatmul_2D", // Issue:: Parser: Variable weights is not supported in function 'parseQMatMul'
"test_qlinearmatmul_3D", // ---- same as above ---
"test_quantizelinear", // Issue::Parser: Blob y_scale not found in const blobs in function 'getBlob' (weights are required as inputs)
"test_quantizelinear_axis", // ---- same as above ---
"test_range_float_type_positive_delta", // Issue:: Unsupported data type in function. Unsupported type in function 'parseCast'
"test_range_float_type_positive_delta_expanded", // ---- same as above ---
"test_range_int32_type_negative_delta", // Issue:: Unsupported data type: INT32 in function. Unsupported type in function 'parseCast'

@ -1,5 +1,13 @@
package org.opencv.test.features2d;
import org.junit.Assert;
import org.opencv.core.CvType;
import org.opencv.core.KeyPoint;
import org.opencv.core.Mat;
import org.opencv.core.MatOfKeyPoint;
import org.opencv.core.Scalar;
import org.opencv.features2d.Features2d;
import org.opencv.features2d.ORB;
import org.opencv.test.OpenCVTestCase;
public class ORBFeatureDetectorTest extends OpenCVTestCase {
@ -36,4 +44,35 @@ public class ORBFeatureDetectorTest extends OpenCVTestCase {
fail("Not yet implemented");
}
public void testDetectTwoPoints() {
Mat img = new Mat(256,256, CvType.CV_8UC3, new Scalar(0,0,0));
img.put(35, 40, 255,255, 255);
img.put(152, 98, 200,0, 0);
MatOfKeyPoint keypoints = new MatOfKeyPoint();
ORB orb = ORB.create();
Mat descriptors = new Mat();
orb.detectAndCompute(img, new Mat(), keypoints, descriptors);
KeyPoint[] keypointsArray = keypoints.toArray();
assertEquals(2, keypointsArray.length);
long x1 = Math.round(keypointsArray[0].pt.x);
long y1 = Math.round(keypointsArray[0].pt.y);
long x2 = Math.round(keypointsArray[1].pt.x);
long y2 = Math.round(keypointsArray[1].pt.y);
if (x2 > x1) {
assertEquals(40, x1);
assertEquals(35, y1);
assertEquals(98, x2);
assertEquals(152, y2);
} else {
assertEquals(40, x2);
assertEquals(35, y2);
assertEquals(98, x1);
assertEquals(152, y1);
}
}
}

@ -377,26 +377,30 @@ void WBaseStream::allocate()
}
void WBaseStream::writeBlock()
bool WBaseStream::writeBlock()
{
int size = (int)(m_current - m_start);
CV_Assert(isOpened());
if( size == 0 )
return;
return true;
if( m_buf )
{
size_t sz = m_buf->size();
m_buf->resize( sz + size );
memcpy( &(*m_buf)[sz], m_start, size );
m_current = m_start;
m_block_pos += size;
return true;
}
else
{
fwrite( m_start, 1, size, m_file );
size_t written = fwrite( m_start, 1, size, m_file );
m_current = m_start;
m_block_pos += size;
return written == (size_t)size;
}
m_current = m_start;
m_block_pos += size;
}
@ -463,15 +467,17 @@ WLByteStream::~WLByteStream()
{
}
void WLByteStream::putByte( int val )
bool WLByteStream::putByte( int val )
{
*m_current++ = (uchar)val;
if( m_current >= m_end )
writeBlock();
return writeBlock();
return true;
}
void WLByteStream::putBytes( const void* buffer, int count )
bool WLByteStream::putBytes( const void* buffer, int count )
{
uchar* data = (uchar*)buffer;
@ -492,12 +498,18 @@ void WLByteStream::putBytes( const void* buffer, int count )
count -= l;
}
if( m_current == m_end )
writeBlock();
{
bool written = writeBlock();
if (!written)
return false;
}
}
return true;
}
void WLByteStream::putWord( int val )
bool WLByteStream::putWord( int val )
{
uchar *current = m_current;
@ -507,17 +519,19 @@ void WLByteStream::putWord( int val )
current[1] = (uchar)(val >> 8);
m_current = current + 2;
if( m_current == m_end )
writeBlock();
return writeBlock();
}
else
{
putByte(val);
putByte(val >> 8);
}
return true;
}
void WLByteStream::putDWord( int val )
bool WLByteStream::putDWord( int val )
{
uchar *current = m_current;
@ -529,7 +543,7 @@ void WLByteStream::putDWord( int val )
current[3] = (uchar)(val >> 24);
m_current = current + 4;
if( m_current == m_end )
writeBlock();
return writeBlock();
}
else
{
@ -538,6 +552,8 @@ void WLByteStream::putDWord( int val )
putByte(val >> 16);
putByte(val >> 24);
}
return true;
}
@ -548,7 +564,7 @@ WMByteStream::~WMByteStream()
}
void WMByteStream::putWord( int val )
bool WMByteStream::putWord( int val )
{
uchar *current = m_current;
@ -558,17 +574,19 @@ void WMByteStream::putWord( int val )
current[1] = (uchar)val;
m_current = current + 2;
if( m_current == m_end )
writeBlock();
return writeBlock();
}
else
{
putByte(val >> 8);
putByte(val);
}
return true;
}
void WMByteStream::putDWord( int val )
bool WMByteStream::putDWord( int val )
{
uchar *current = m_current;
@ -580,7 +598,7 @@ void WMByteStream::putDWord( int val )
current[3] = (uchar)val;
m_current = current + 4;
if( m_current == m_end )
writeBlock();
return writeBlock();
}
else
{
@ -589,6 +607,8 @@ void WMByteStream::putDWord( int val )
putByte(val >> 8);
putByte(val);
}
return true;
}
}

@ -63,6 +63,12 @@ DECLARE_RBS_EXCEPTION(THROW_FORB)
DECLARE_RBS_EXCEPTION(BAD_HEADER)
#define RBS_BAD_HEADER RBS_BAD_HEADER_Exception(cv::Error::StsError, "Invalid header", CV_Func, __FILE__, __LINE__)
#define CHECK_WRITE(action) \
if (!action) \
{ \
return false; \
}
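A one-function sketch of the intended use of CHECK_WRITE inside a bool-returning encoder routine (the function name and values are illustrative; the real call sites appear in the per-format encoders below):

// Sketch: bail out with 'false' as soon as any stream write fails.
static bool writeHeaderSketch(cv::WLByteStream& strm)
{
    CHECK_WRITE(strm.putWord(0x4D42));   // illustrative header word
    CHECK_WRITE(strm.putDWord(0));
    return true;
}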
typedef unsigned long ulong;
// class RBaseStream - base class for other reading streams.
@ -147,7 +153,7 @@ protected:
bool m_is_opened;
std::vector<uchar>* m_buf;
virtual void writeBlock();
virtual bool writeBlock();
virtual void release();
virtual void allocate();
};
@ -160,10 +166,10 @@ class WLByteStream : public WBaseStream
public:
virtual ~WLByteStream();
void putByte( int val );
void putBytes( const void* buffer, int count );
void putWord( int val );
void putDWord( int val );
bool putByte( int val );
bool putBytes( const void* buffer, int count );
bool putWord( int val );
bool putDWord( int val );
};
@ -173,8 +179,8 @@ class WMByteStream : public WLByteStream
{
public:
virtual ~WMByteStream();
void putWord( int val );
void putDWord( int val );
bool putWord( int val );
bool putDWord( int val );
};
inline unsigned BSWAP(unsigned v)

@ -635,38 +635,40 @@ bool BmpEncoder::write( const Mat& img, const std::vector<int>& )
m_buf->reserve( alignSize(fileSize + 16, 256) );
// write signature 'BM'
strm.putBytes( fmtSignBmp, (int)strlen(fmtSignBmp) );
CHECK_WRITE(strm.putBytes( fmtSignBmp, (int)strlen(fmtSignBmp) ));
// write file header
strm.putDWord( validateToInt(fileSize) ); // file size
strm.putDWord( 0 );
strm.putDWord( headerSize );
CHECK_WRITE(strm.putDWord( validateToInt(fileSize) )); // file size
CHECK_WRITE(strm.putDWord( 0 ));
CHECK_WRITE(strm.putDWord( headerSize ));
// write bitmap header
strm.putDWord( bitmapHeaderSize );
strm.putDWord( width );
strm.putDWord( height );
strm.putWord( 1 );
strm.putWord( channels << 3 );
strm.putDWord( BMP_RGB );
strm.putDWord( 0 );
strm.putDWord( 0 );
strm.putDWord( 0 );
strm.putDWord( 0 );
strm.putDWord( 0 );
CHECK_WRITE(strm.putDWord( bitmapHeaderSize ));
CHECK_WRITE(strm.putDWord( width ));
CHECK_WRITE(strm.putDWord( height ));
CHECK_WRITE(strm.putWord( 1 ));
CHECK_WRITE(strm.putWord( channels << 3 ));
CHECK_WRITE(strm.putDWord( BMP_RGB ));
CHECK_WRITE(strm.putDWord( 0 ));
CHECK_WRITE(strm.putDWord( 0 ));
CHECK_WRITE(strm.putDWord( 0 ));
CHECK_WRITE(strm.putDWord( 0 ));
CHECK_WRITE(strm.putDWord( 0 ));
if( channels == 1 )
{
FillGrayPalette( palette, 8 );
strm.putBytes( palette, sizeof(palette));
CHECK_WRITE(strm.putBytes( palette, sizeof(palette)));
}
width *= channels;
for( int y = height - 1; y >= 0; y-- )
{
strm.putBytes( img.ptr(y), width );
CHECK_WRITE(strm.putBytes( img.ptr(y), width ));
if( fileStep > width )
strm.putBytes( zeropad, fileStep - width );
{
CHECK_WRITE(strm.putBytes( zeropad, fileStep - width ));
}
}
strm.close();

@ -64,11 +64,11 @@ T read_number(cv::RLByteStream& strm)
return atoT<T>(str);
}
template<typename T> void write_anything(cv::WLByteStream& strm, const T& t)
template<typename T> bool write_anything(cv::WLByteStream& strm, const T& t)
{
std::ostringstream ss;
ss << t;
strm.putBytes(ss.str().c_str(), static_cast<int>(ss.str().size()));
return strm.putBytes(ss.str().c_str(), static_cast<int>(ss.str().size()));
}
}
@ -206,33 +206,33 @@ bool PFMEncoder::write(const Mat& img, const std::vector<int>& params)
}
Mat float_img;
strm.putByte('P');
CHECK_WRITE(strm.putByte('P'));
switch (img.channels()) {
case 1:
strm.putByte('f');
CHECK_WRITE(strm.putByte('f'));
img.convertTo(float_img, CV_32FC1);
break;
case 3:
strm.putByte('F');
CHECK_WRITE(strm.putByte('F'));
img.convertTo(float_img, CV_32FC3);
break;
default:
CV_Error(Error::StsBadArg, "Expected 1 or 3 channel image.");
}
strm.putByte('\n');
CHECK_WRITE(strm.putByte('\n'));
write_anything(strm, float_img.cols);
strm.putByte(' ');
write_anything(strm, float_img.rows);
strm.putByte('\n');
CHECK_WRITE(write_anything(strm, float_img.cols));
CHECK_WRITE(strm.putByte(' '));
CHECK_WRITE(write_anything(strm, float_img.rows));
CHECK_WRITE(strm.putByte('\n'));
#ifdef WORDS_BIGENDIAN
write_anything(strm, 1.0);
CHECK_WRITE(write_anything(strm, 1.0));
#else
write_anything(strm, -1.0);
CHECK_WRITE(write_anything(strm, -1.0));
#endif
strm.putByte('\n');
CHECK_WRITE(strm.putByte('\n'));
// Comments are not officially supported in this file format.
// write_anything(strm, "# Generated by OpenCV " CV_VERSION "\n");
@ -248,17 +248,15 @@ bool PFMEncoder::write(const Mat& img, const std::vector<int>& params)
rgb_row[x*3+1] = bgr_row[x*3+1];
rgb_row[x*3+2] = bgr_row[x*3+0];
}
strm.putBytes( reinterpret_cast<const uchar*>(rgb_row.data()),
static_cast<int>(sizeof(float) * row_size) );
CHECK_WRITE(strm.putBytes( reinterpret_cast<const uchar*>(rgb_row.data()),
static_cast<int>(sizeof(float) * row_size) ));
} else if (float_img.channels() == 1) {
strm.putBytes(float_img.ptr(y), sizeof(float) * float_img.cols);
CHECK_WRITE(strm.putBytes(float_img.ptr(y), sizeof(float) * float_img.cols));
}
}
return true;
}
}
#endif // HAVE_IMGCODEC_PFM

@ -479,7 +479,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
header_sz += sz;
}
strm.putBytes(buffer, header_sz);
CHECK_WRITE(strm.putBytes(buffer, header_sz));
for( y = 0; y < height; y++ )
{
@ -512,7 +512,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
{
*ptr++ = byte;
}
strm.putBytes(buffer, (int)(ptr - buffer));
CHECK_WRITE(strm.putBytes(buffer, (int)(ptr - buffer)));
continue;
}
@ -539,7 +539,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
}
}
strm.putBytes( (channels > 1 || depth > 8) ? buffer : (const char*)data, fileStep);
CHECK_WRITE(strm.putBytes( (channels > 1 || depth > 8) ? buffer : (const char*)data, fileStep));
}
else
{
@ -610,7 +610,7 @@ bool PxMEncoder::write(const Mat& img, const std::vector<int>& params)
*ptr++ = '\n';
strm.putBytes( buffer, (int)(ptr - buffer) );
CHECK_WRITE(strm.putBytes( buffer, (int)(ptr - buffer) ));
}
}

@ -410,17 +410,17 @@ bool SunRasterEncoder::write( const Mat& img, const std::vector<int>& )
if( strm.open(m_filename) )
{
strm.putBytes( fmtSignSunRas, (int)strlen(fmtSignSunRas) );
strm.putDWord( width );
strm.putDWord( height );
strm.putDWord( channels*8 );
strm.putDWord( fileStep*height );
strm.putDWord( RAS_STANDARD );
strm.putDWord( RMT_NONE );
strm.putDWord( 0 );
CHECK_WRITE(strm.putBytes( fmtSignSunRas, (int)strlen(fmtSignSunRas) ));
CHECK_WRITE(strm.putDWord( width ));
CHECK_WRITE(strm.putDWord( height ));
CHECK_WRITE(strm.putDWord( channels*8 ));
CHECK_WRITE(strm.putDWord( fileStep*height ));
CHECK_WRITE(strm.putDWord( RAS_STANDARD ));
CHECK_WRITE(strm.putDWord( RMT_NONE ));
CHECK_WRITE(strm.putDWord( 0 ));
for( y = 0; y < height; y++ )
strm.putBytes( img.ptr(y), fileStep );
CHECK_WRITE(strm.putBytes( img.ptr(y), fileStep ));
strm.close();
result = true;

@ -1100,16 +1100,6 @@ bool TiffEncoder::isFormatSupported( int depth ) const
return depth == CV_8U || depth == CV_8S || depth == CV_16U || depth == CV_16S || depth == CV_32S || depth == CV_32F || depth == CV_64F;
}
void TiffEncoder::writeTag( WLByteStream& strm, TiffTag tag,
TiffFieldType fieldType,
int count, int value )
{
strm.putWord( tag );
strm.putWord( fieldType );
strm.putDWord( count );
strm.putDWord( value );
}
class TiffEncoderBufHelper
{
public:

@ -132,10 +132,6 @@ public:
ImageEncoder newEncoder() const CV_OVERRIDE;
protected:
void writeTag( WLByteStream& strm, TiffTag tag,
TiffFieldType fieldType,
int count, int value );
bool writeLibTiff( const std::vector<Mat>& img_vec, const std::vector<int>& params );
bool write_32FC3_SGILOG(const Mat& img, void* tif);

@ -3771,10 +3771,11 @@ floating-point.
@param code color space conversion code (see #ColorConversionCodes).
@param dstCn number of channels in the destination image; if the parameter is 0, the number of the
channels is derived automatically from src and code.
@param hint Implementation modification flags. See #AlgorithmHint
@see @ref imgproc_color_conversions
*/
CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0 );
CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0, AlgorithmHint hint = cv::ALGO_HINT_DEFAULT );
/** @brief Converts an image from one color space to another where the source image is
stored in two planes.
@ -3793,8 +3794,9 @@ This function only supports YUV420 to RGB conversion as of now.
- #COLOR_YUV2RGB_NV21
- #COLOR_YUV2BGRA_NV21
- #COLOR_YUV2RGBA_NV21
@param hint Implementation modification flags. See #AlgorithmHint
*/
CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code );
CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code, AlgorithmHint hint = cv::ALGO_HINT_DEFAULT );
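A minimal usage sketch for the new hint parameter (ALGO_HINT_DEFAULT / ALGO_HINT_APPROX are the enum values referenced elsewhere in this change; the wrapper function below is illustrative):

#include <opencv2/imgproc.hpp>

// Sketch: request the approximate (possibly not bit-exact) conversion path.
void convertApprox(const cv::Mat& bgr, cv::Mat& yuv)
{
    // dstCn = 0 keeps the default channel count; ALGO_HINT_APPROX lets the
    // dispatcher prefer a faster, approximate HAL implementation when one exists.
    cv::cvtColor(bgr, yuv, cv::COLOR_BGR2YUV, 0, cv::ALGO_HINT_APPROX);
}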
/** @brief main function for all demosaicing processes

@ -108,11 +108,19 @@ CV_EXPORTS void warpAffine(int src_type,
uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[6], int interpolation, int borderType, const double borderValue[4]);
CV_EXPORTS void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
CV_EXPORTS void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
CV_EXPORTS void warpPerspective(int src_type,
const uchar * src_data, size_t src_step, int src_width, int src_height,
uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
const double M[9], int interpolation, int borderType, const double borderValue[4]);
CV_EXPORTS void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw);
CV_EXPORTS void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw);
CV_EXPORTS void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,

@ -12,6 +12,12 @@
#define CV_HAL_INTER_CUBIC 2
#define CV_HAL_INTER_AREA 3
#define CV_HAL_INTER_LANCZOS4 4
#define CV_HAL_INTER_LINEAR_EXACT 5
#define CV_HAL_INTER_NEAREST_EXACT 6
#define CV_HAL_INTER_MAX 7
#define CV_HAL_WARP_FILL_OUTLIERS 8
#define CV_HAL_WARP_INVERSE_MAP 16
#define CV_HAL_WARP_RELATIVE_MAP 32
//! @}
//! @name Morphology operations

@ -168,7 +168,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
// helper function for dual-plane modes
void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int code )
void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int code, AlgorithmHint hint )
{
// only YUV420 is currently supported
switch (code)
@ -181,7 +181,7 @@ void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, in
return;
}
cvtColorTwoPlaneYUV2BGRpair(_ysrc, _uvsrc, _dst, dstChannels(code), swapBlue(code), uIndex(code));
cvtColorTwoPlaneYUV2BGRpair(_ysrc, _uvsrc, _dst, hint, dstChannels(code), swapBlue(code), uIndex(code));
}
@ -189,10 +189,13 @@ void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, in
// The main function //
//////////////////////////////////////////////////////////////////////////////////////////
void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
if (hint == cv::ALGO_HINT_DEFAULT)
hint = cv::getDefaultAlgorithmHint();
CV_Assert(!_src.empty());
if(dcn <= 0)
@ -244,12 +247,12 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
case COLOR_BGR2YUV: case COLOR_RGB2YUV:
cvtColorBGR2YUV(_src, _dst, swapBlue(code), code == COLOR_BGR2YCrCb || code == COLOR_RGB2YCrCb);
cvtColorBGR2YUV(_src, _dst, hint, swapBlue(code), code == COLOR_BGR2YCrCb || code == COLOR_RGB2YCrCb);
break;
case COLOR_YCrCb2BGR: case COLOR_YCrCb2RGB:
case COLOR_YUV2BGR: case COLOR_YUV2RGB:
cvtColorYUV2BGR(_src, _dst, dcn, swapBlue(code), code == COLOR_YCrCb2BGR || code == COLOR_YCrCb2RGB);
cvtColorYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), code == COLOR_YCrCb2BGR || code == COLOR_YCrCb2RGB);
break;
case COLOR_BGR2XYZ:
@ -321,14 +324,14 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
// http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
// http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
cvtColorTwoPlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code));
cvtColorTwoPlaneYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), uIndex(code));
break;
case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
//http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
//http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxN Y plane followed by (N/2)x(N/2) U and V planes
cvtColorThreePlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code));
cvtColorThreePlaneYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), uIndex(code));
break;
case COLOR_YUV2GRAY_420:
@ -337,7 +340,7 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
cvtColorBGR2ThreePlaneYUV(_src, _dst, swapBlue(code), uIndex(code));
cvtColorBGR2ThreePlaneYUV(_src, _dst, hint, swapBlue(code), uIndex(code));
break;
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
@ -349,7 +352,7 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
{
int ycn = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2BGR_UYVY ||
code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
cvtColorOnePlaneYUV2BGR(_src, _dst, dcn, swapBlue(code), uIndex(code), ycn);
cvtColorOnePlaneYUV2BGR(_src, _dst, hint, dcn, swapBlue(code), uIndex(code), ycn);
break;
}
@ -362,7 +365,7 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
{
int ycn = (code==COLOR_RGB2YUV_UYVY || code==COLOR_BGR2YUV_UYVY ||
code==COLOR_RGBA2YUV_UYVY || code==COLOR_BGRA2YUV_UYVY) ? 1 : 0;
cvtColorOnePlaneBGR2YUV(_src, _dst, swapBlue(code), uIndex(code), ycn);
cvtColorOnePlaneBGR2YUV(_src, _dst, hint, swapBlue(code), uIndex(code), ycn);
break;
}

@ -556,15 +556,15 @@ void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bo
void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb );
void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb );
void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb);
void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb);
void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn);
void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn);
void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx);
void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, bool crcb);
void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, bool crcb);
void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx, int ycn );
void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx, int ycn );
void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx );
void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx );
void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx );
void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx );
void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst );
void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi );

@ -18,13 +18,18 @@ namespace cv {
namespace hal {
// 8u, 16u, 32f
void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
static void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int depth, int scn, bool swapBlue, bool isCbCr)
int depth, int scn, bool swapBlue, bool isCbCr, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUVApprox, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr);
}
CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr);
#if defined(HAVE_IPP)
@ -66,13 +71,18 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
CV_CPU_DISPATCH_MODES_ALL);
}
void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
static void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int depth, int dcn, bool swapBlue, bool isCbCr)
int depth, int dcn, bool swapBlue, bool isCbCr, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr);
}
CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr);
@ -115,63 +125,79 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
CV_CPU_DISPATCH_MODES_ALL);
}
// 4:2:0, two planes in one array: Y, UV interleaved
// 4:2:0, two planes: Y, UV interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
static void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
uchar * dst_data, size_t dst_step,
int dst_width, int dst_height,
int dcn, bool swapBlue, int uIdx)
int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtTwoPlaneYUVtoBGREx, cv_hal_cvtTwoPlaneYUVtoBGRExApprox,
y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
}
cvtTwoPlaneYUVtoBGR(
src_data, src_step, src_data + src_step * dst_height, src_step, dst_data, dst_step,
dst_width, dst_height, dcn, swapBlue, uIdx);
CALL_HAL(cvtTwoPlaneYUVtoBGREx, cv_hal_cvtTwoPlaneYUVtoBGREx,
y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
CV_CPU_DISPATCH_MODES_ALL);
}
// 4:2:0, two planes: Y, UV interleaved
// 4:2:0, two planes in one array: Y, UV interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
static void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int dst_width, int dst_height,
int dcn, bool swapBlue, int uIdx)
int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
cvtTwoPlaneYUVtoBGR(y_data, src_step, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
}
CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
cvtTwoPlaneYUVtoBGR(
src_data, src_step, src_data + src_step * dst_height, src_step, dst_data, dst_step,
dst_width, dst_height, dcn, swapBlue, uIdx, hint);
}
// 4:2:0, two planes: Y, UV interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
static void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int dst_width, int dst_height,
int dcn, bool swapBlue, int uIdx)
int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
CALL_HAL(cvtTwoPlaneYUVtoBGREx, cv_hal_cvtTwoPlaneYUVtoBGREx,
y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (y_data, y_step, uv_data, uv_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
CV_CPU_DISPATCH_MODES_ALL);
cvtTwoPlaneYUVtoBGR(y_data, src_step, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx, hint);
}
// 4:2:0, three planes in one array: Y, U, V
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
static void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int dst_width, int dst_height,
int dcn, bool swapBlue, int uIdx)
int dcn, bool swapBlue, int uIdx, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
}
CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
CV_CPU_DISPATCH(cvtThreePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx),
@ -181,46 +207,39 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
// 4:2:0, three planes in one array: Y, U, V
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
static void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int scn, bool swapBlue, int uIdx)
int scn, bool swapBlue, int uIdx, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUVApprox, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx);
}
CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx);
CV_CPU_DISPATCH(cvtBGRtoThreePlaneYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx),
CV_CPU_DISPATCH_MODES_ALL);
}
// 4:2:0, two planes: Y, UV interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
uchar * y_data, uchar * uv_data, size_t dst_step,
int width, int height,
int scn, bool swapBlue, int uIdx)
{
CV_INSTRUMENT_REGION();
CALL_HAL(cvtBGRtoTwoPlaneYUV, cv_hal_cvtBGRtoTwoPlaneYUV,
src_data, src_step, y_data, dst_step, uv_data, dst_step, width, height, scn, swapBlue, uIdx);
CV_CPU_DISPATCH(cvtBGRtoTwoPlaneYUV, (src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx),
CV_CPU_DISPATCH_MODES_ALL);
}
// 4:2:2 interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
static void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int dcn, bool swapBlue, int uIdx, int ycn)
int dcn, bool swapBlue, int uIdx, int ycn, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGRApprox, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn);
}
CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn);
CV_CPU_DISPATCH(cvtOnePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn),
@ -230,13 +249,18 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
// 4:2:2 interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 14-bit fixed-point arithmetics is used
void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
static void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
uchar * dst_data, size_t dst_step,
int width, int height,
int scn, bool swapBlue, int uIdx, int ycn)
int scn, bool swapBlue, int uIdx, int ycn, AlgorithmHint hint)
{
CV_INSTRUMENT_REGION();
if (hint == ALGO_HINT_APPROX)
{
CALL_HAL(cvtOnePlaneBGRtoYUV, cv_hal_cvtOnePlaneBGRtoYUVApprox, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn);
}
CALL_HAL(cvtOnePlaneBGRtoYUV, cv_hal_cvtOnePlaneBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn);
CV_CPU_DISPATCH(cvtOnePlaneBGRtoYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn),
@ -386,43 +410,43 @@ bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx,
// HAL calls
//
void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, bool swapb, bool crcb)
void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, bool crcb)
{
CvtHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
hal::cvtBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
h.depth, h.scn, swapb, crcb);
h.depth, h.scn, swapb, crcb, hint);
}
void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb)
void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, bool crcb)
{
if(dcn <= 0) dcn = 3;
CvtHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
hal::cvtYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
h.depth, dcn, swapb, crcb);
h.depth, dcn, swapb, crcb, hint);
}
// 4:2:2 interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn)
void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx, int ycn)
{
CvtHelper< Set<2>, Set<3, 4>, Set<CV_8U>, FROM_UYVY > h(_src, _dst, dcn);
hal::cvtOnePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
dcn, swapb, uidx, ycn);
dcn, swapb, uidx, ycn, hint);
}
// 4:2:2 interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 14-bit fixed-point arithmetics is used
void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn)
void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx, int ycn)
{
CvtHelper< Set<3, 4>, Set<2>, Set<CV_8U>, TO_UYVY > h(_src, _dst, 2);
hal::cvtOnePlaneBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
h.scn, swapb, uidx, ycn);
h.scn, swapb, uidx, ycn, hint);
}
void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
@ -435,12 +459,12 @@ void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
// 4:2:0, three planes in one array: Y, U, V
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx)
void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, AlgorithmHint hint, bool swapb, int uidx)
{
CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
hal::cvtBGRtoThreePlaneYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
h.scn, swapb, uidx);
h.scn, swapb, uidx, hint);
}
void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
@ -460,32 +484,32 @@ void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
// 4:2:0, three planes in one array: Y, U, V
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx)
void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx)
{
if(dcn <= 0) dcn = 3;
CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV> h(_src, _dst, dcn);
hal::cvtThreePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows,
dcn, swapb, uidx);
dcn, swapb, uidx, hint);
}
// 4:2:0, two planes in one array: Y, UV interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
// see also: http://www.fourcc.org/yuv.php#NV21, http://www.fourcc.org/yuv.php#NV12
void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx )
void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx )
{
if(dcn <= 0) dcn = 3;
CvtHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV> h(_src, _dst, dcn);
hal::cvtTwoPlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows,
dcn, swapb, uidx);
dcn, swapb, uidx, hint);
}
// 4:2:0, two planes: Y, UV interleaved
// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
// 20-bit fixed-point arithmetics
void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx )
void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, AlgorithmHint hint, int dcn, bool swapb, int uidx )
{
int stype = _ysrc.type();
int depth = CV_MAT_DEPTH(stype);
@ -503,13 +527,13 @@ void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArr
{
hal::cvtTwoPlaneYUVtoBGR(ysrc.data, uvsrc.data, ysrc.step,
dst.data, dst.step, dst.cols, dst.rows,
dcn, swapb, uidx);
dcn, swapb, uidx, hint);
}
else
{
hal::cvtTwoPlaneYUVtoBGR(ysrc.data, ysrc.step, uvsrc.data, uvsrc.step,
dst.data, dst.step, dst.cols, dst.rows,
dcn, swapb, uidx);
dcn, swapb, uidx, hint);
}
}

@ -273,6 +273,29 @@ inline int hal_ni_resize(int src_type, const uchar *src_data, size_t src_step, i
@sa cv::warpAffine, cv::hal::warpAffine
*/
inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpAffineBlocklineNN computes a row of the affine transformation (nearest-neighbor variant)
@param adelta input M0 * x array
@param bdelta input M3 * x array
@param xy output (x', y') coordinates
@param X0 input M1 * y + M2 value
@param Y0 input M4 * y + M5 value
@param bw length of the row
@sa cv::warpAffineBlocklineNN, cv::hal::warpAffineBlocklineNN
*/
inline int hal_ni_warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpAffineBlockline computes a row of the affine transformation
@param adelta input M0 * x array
@param bdelta input M3 * x array
@param xy output (x', y') coordinates
@param alpha output least significant bits of the (x', y') coordinates for interpolation
@param X0 input M1 * y + M2 value
@param Y0 input M4 * y + M5 value
@param bw length of the row
@sa cv::warpAffineBlockline, cv::hal::warpAffineBlockline
*/
inline int hal_ni_warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpPerspective
@param src_type source and destination image type
@ -291,11 +314,38 @@ inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_ste
@sa cv::warpPerspective, cv::hal::warpPerspective
*/
inline int hal_ni_warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpPerspectiveBlocklineNN performs one row of the perspective transformation (nearest-neighbor path)
@param M 3x3 matrix with transform coefficients
@param xy output (x', y') coordinates
@param X0 input M0 * x0 + M1 * y + M2 value
@param Y0 input M3 * x0 + M4 * y + M5 value
@param W0 input M6 * x0 + M7 * y + M8 value
@param bw length of the row
@sa cv::warpPerspectiveBlocklineNN, cv::hal::warpPerspectiveBlocklineNN
*/
inline int hal_ni_warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_warpPerspectiveBlockline performs one row of the perspective transformation (interpolation path)
@param M 3x3 matrix with transform coefficients
@param xy output (x', y') coordinates
@param alpha output least significant bits of the (x', y') coordinates for interpolation
@param X0 input M0 * x0 + M1 * y + M2 value
@param Y0 input M3 * x0 + M4 * y + M5 value
@param W0 input M6 * x0 + M7 * y + M8 value
@param bw length of the row
@sa cv::warpPerspectiveBlockline, cv::hal::warpPerspectiveBlockline
*/
inline int hal_ni_warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
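The perspective variant differs only in the per-pixel division by W; the scalar fallback further down in imgwarp.cpp reduces to the following sketch (needs <climits> and <algorithm> in addition to the imgproc header):
// Scalar reference for one perspective interpolation blockline (mirrors the fallback loop below).
static void warpPerspectiveBlockline_ref(const double* M, short* xy, short* alpha,
                                         double X0, double Y0, double W0, int bw)
{
    for (int x1 = 0; x1 < bw; x1++)
    {
        double W = W0 + M[6]*x1;
        W = W ? cv::INTER_TAB_SIZE/W : 0;                              // perspective divide
        double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
        double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
        int X = cv::saturate_cast<int>(fX), Y = cv::saturate_cast<int>(fY);
        xy[x1*2]   = cv::saturate_cast<short>(X >> cv::INTER_BITS);
        xy[x1*2+1] = cv::saturate_cast<short>(Y >> cv::INTER_BITS);
        alpha[x1]  = (short)((Y & (cv::INTER_TAB_SIZE-1))*cv::INTER_TAB_SIZE +
                             (X & (cv::INTER_TAB_SIZE-1)));
    }
}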
//! @cond IGNORED
#define cv_hal_resize hal_ni_resize
#define cv_hal_warpAffine hal_ni_warpAffine
#define cv_hal_warpAffineBlocklineNN hal_ni_warpAffineBlocklineNN
#define cv_hal_warpAffineBlockline hal_ni_warpAffineBlockline
#define cv_hal_warpPerspective hal_ni_warpPerspective
#define cv_hal_warpPerspectiveBlocklineNN hal_ni_warpPerspectiveBlocklineNN
#define cv_hal_warpPerspectiveBlockline hal_ni_warpPerspectiveBlockline
//! @endcond
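A custom HAL takes over any of these hooks by redefining the corresponding macro in its own header. A hedged sketch (the function name is hypothetical; returning CV_HAL_ERROR_NOT_IMPLEMENTED from the hook falls back to the built-in code path):
// Hypothetical vendor HAL header, not part of this patch.
int my_warpAffineBlockline(int* adelta, int* bdelta, short* xy, short* alpha,
                           int X0, int Y0, int bw);   // vendor-provided implementation
#undef  cv_hal_warpAffineBlockline
#define cv_hal_warpAffineBlockline my_warpAffineBlockline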
/**
@ -449,6 +499,23 @@ inline int hal_ni_cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar
*/
inline int hal_ni_cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Analog of hal_cvtBGRtoYUV, but allows approximations (not bit-exact)
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param width image width
@param height image height
@param depth image depth (one of CV_8U, CV_16U or CV_32F)
@param scn source image channels (3 or 4)
@param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
@param isCbCr if set to true write output in YCbCr format
Convert from BGR, RGB, BGRA or RGBA to YUV or YCbCr.
*/
inline int hal_ni_cvtBGRtoYUVApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_cvtYUVtoBGR
@param src_data source image data
@ -465,6 +532,22 @@ inline int hal_ni_cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * d
*/
inline int hal_ni_cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Analog of hal_cvtYUVtoBGR, but allows approximations (not bit-exact)
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param width image width
@param height image height
@param depth image depth (one of CV_8U, CV_16U or CV_32F)
@param dcn destination image channels (3 or 4)
@param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@param isCbCr if set to true treat source as YCbCr
Convert from YUV or YCbCr to BGR, RGB, BGRA or RGBA.
*/
inline int hal_ni_cvtYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
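An Approx hook may trade bit-exactness for speed, but it should reject configurations it does not handle so OpenCV can fall back to its own implementation. A hypothetical skeleton (the function name is illustrative only):
// Sketch of a vendor implementation of the approximate YUV->BGR hook.
int my_cvtYUVtoBGRApprox(const uchar* src_data, size_t src_step,
                         uchar* dst_data, size_t dst_step,
                         int width, int height, int depth,
                         int dcn, bool swapBlue, bool isCbCr)
{
    if (depth != CV_8U || isCbCr)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;   // defer to the OpenCV code path
    // ... fast, approximate YUV -> BGR/RGB(A) conversion ...
    return CV_HAL_ERROR_OK;
}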
/**
@brief hal_cvtBGRtoXYZ
@param src_data source image data
@ -580,6 +663,24 @@ inline int hal_ni_cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * d
*/
inline int hal_ni_cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Analog of hal_cvtTwoPlaneYUVtoBGR that allows approximations (not bit-exact)
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param dst_width destination image width
@param dst_height destination image height
@param dcn destination image channels (3 or 4)
@param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@param uIdx U-channel index in the interleaved U/V plane (0 or 1)
Convert from YUV (YUV420sp (or NV12/NV21) - Y plane followed by interleaved U/V plane) to BGR, RGB, BGRA or RGBA.
Only for CV_8U.
Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
*/
inline int hal_ni_cvtTwoPlaneYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Extended version of hal_cvtTwoPlaneYUVtoBGR.
@param y_data source image data (Y-plane)
@ -601,6 +702,27 @@ inline int hal_ni_cvtTwoPlaneYUVtoBGREx(const uchar * y_data, size_t y_step, con
uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Extended version of hal_cvtTwoPlaneYUVtoBGR that allows approximations (not bit-exact)
@param y_data source image data (Y-plane)
@param y_step source image step (Y-plane)
@param uv_data source image data (UV-plane)
@param uv_step source image step (UV-plane)
@param dst_data destination image data
@param dst_step destination image step
@param dst_width destination image width
@param dst_height destination image height
@param dcn destination image channels (3 or 4)
@param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@param uIdx U-channel index in the interleaved U/V plane (0 or 1)
Convert from YUV (YUV420sp (or NV12/NV21) - Y plane followed by interleaved U/V plane) to BGR, RGB, BGRA or RGBA.
Only for CV_8U.
Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
*/
inline int hal_ni_cvtTwoPlaneYUVtoBGRExApprox(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_cvtBGRtoTwoPlaneYUV
@param src_data source image data
@ -640,6 +762,23 @@ inline int hal_ni_cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
*/
inline int hal_ni_cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Analog of hal_cvtThreePlaneYUVtoBGR that allows approximations (not bit-exact)
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param dst_width destination image width
@param dst_height destination image height
@param dcn destination image channels (3 or 4)
@param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@param uIdx U-channel plane index (0 or 1)
Convert from YUV (YUV420p (or YV12/YV21) - Y plane followed by U and V planes) to BGR, RGB, BGRA or RGBA.
Only for CV_8U.
Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
*/
inline int hal_ni_cvtThreePlaneYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_cvtBGRtoThreePlaneYUV
@param src_data source image data
@ -657,6 +796,24 @@ inline int hal_ni_cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
*/
inline int hal_ni_cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Analog of hal_cvtBGRtoThreePlaneYUV that allows approximations (not bit-exact)
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param width image width
@param height image height
@param scn source image channels (3 or 4)
@param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
@param uIdx U-channel plane index (0 or 1)
Convert from BGR, RGB, BGRA or RGBA to YUV (YUV420p (or YV12/YV21) - Y plane followed by U and V planes).
Only for CV_8U.
Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
*/
inline int hal_ni_cvtBGRtoThreePlaneYUVApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_cvtOnePlaneYUVtoBGR
@param src_data source image data
@ -675,6 +832,24 @@ inline int hal_ni_cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
*/
inline int hal_ni_cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Analog of hal_cvtOnePlaneYUVtoBGR that allows approximations (not bit-exact)
@param src_data source image data
@param src_step source image step
@param dst_data destination image data
@param dst_step destination image step
@param width image width
@param height image height
@param dcn destination image channels (3 or 4)
@param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@param uIdx U-channel index (0 or 1)
@param ycn Y-channel index (0 or 1)
Convert from interleaved YUV 4:2:2 (UYVY, YUY2 or YVYU) to BGR, RGB, BGRA or RGBA.
Only for CV_8U.
Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
*/
inline int hal_ni_cvtOnePlaneYUVtoBGRApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_cvtOnePlaneBGRtoYUV
@param src_data,src_step source image data and step
@ -690,6 +865,21 @@ inline int hal_ni_cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, u
*/
inline int hal_ni_cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief Analog of hal_cvtOnePlaneBGRtoYUV that allows approximations (not bit-exact)
@param src_data,src_step source image data and step
@param dst_data,dst_step destination image data and step
@param width,height image size
@param scn source image channels (3 or 4)
@param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@param uIdx U-channel index (0 or 1)
@param ycn Y-channel index (0 or 1)
Convert from BGR, RGB, BGRA or RGBA to interleaved YUV 4:2:2 (UYVY, YUY2 or YVYU).
Only for CV_8U.
Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
*/
inline int hal_ni_cvtOnePlaneBGRtoYUVApprox(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
/**
@brief hal_cvtRGBAtoMultipliedRGBA
@param src_data source image data
@ -725,7 +915,9 @@ inline int hal_ni_cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_ste
#define cv_hal_cvtBGR5x5toGray hal_ni_cvtBGR5x5toGray
#define cv_hal_cvtGraytoBGR5x5 hal_ni_cvtGraytoBGR5x5
#define cv_hal_cvtBGRtoYUV hal_ni_cvtBGRtoYUV
#define cv_hal_cvtBGRtoYUVApprox hal_ni_cvtBGRtoYUVApprox
#define cv_hal_cvtYUVtoBGR hal_ni_cvtYUVtoBGR
#define cv_hal_cvtYUVtoBGRApprox hal_ni_cvtYUVtoBGRApprox
#define cv_hal_cvtBGRtoXYZ hal_ni_cvtBGRtoXYZ
#define cv_hal_cvtXYZtoBGR hal_ni_cvtXYZtoBGR
#define cv_hal_cvtBGRtoHSV hal_ni_cvtBGRtoHSV
@ -733,12 +925,18 @@ inline int hal_ni_cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_ste
#define cv_hal_cvtBGRtoLab hal_ni_cvtBGRtoLab
#define cv_hal_cvtLabtoBGR hal_ni_cvtLabtoBGR
#define cv_hal_cvtTwoPlaneYUVtoBGR hal_ni_cvtTwoPlaneYUVtoBGR
#define cv_hal_cvtTwoPlaneYUVtoBGRApprox hal_ni_cvtTwoPlaneYUVtoBGRApprox
#define cv_hal_cvtTwoPlaneYUVtoBGREx hal_ni_cvtTwoPlaneYUVtoBGREx
#define cv_hal_cvtTwoPlaneYUVtoBGRExApprox hal_ni_cvtTwoPlaneYUVtoBGRExApprox
#define cv_hal_cvtBGRtoTwoPlaneYUV hal_ni_cvtBGRtoTwoPlaneYUV
#define cv_hal_cvtThreePlaneYUVtoBGR hal_ni_cvtThreePlaneYUVtoBGR
#define cv_hal_cvtThreePlaneYUVtoBGRApprox hal_ni_cvtThreePlaneYUVtoBGRApprox
#define cv_hal_cvtBGRtoThreePlaneYUV hal_ni_cvtBGRtoThreePlaneYUV
#define cv_hal_cvtBGRtoThreePlaneYUVApprox hal_ni_cvtBGRtoThreePlaneYUVApprox
#define cv_hal_cvtOnePlaneYUVtoBGR hal_ni_cvtOnePlaneYUVtoBGR
#define cv_hal_cvtOnePlaneYUVtoBGRApprox hal_ni_cvtOnePlaneYUVtoBGRApprox
#define cv_hal_cvtOnePlaneBGRtoYUV hal_ni_cvtOnePlaneBGRtoYUV
#define cv_hal_cvtOnePlaneBGRtoYUVApprox hal_ni_cvtOnePlaneBGRtoYUVApprox
#define cv_hal_cvtRGBAtoMultipliedRGBA hal_ni_cvtRGBAtoMultipliedRGBA
#define cv_hal_cvtMultipliedRGBAtoRGBA hal_ni_cvtMultipliedRGBAtoRGBA
//! @endcond

@ -2169,16 +2169,7 @@ public:
short *XY = __XY.data(), *A = __A.data();
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_TRY_AVX2
bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
#endif
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif
#if CV_TRY_LASX
bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
#endif
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, y1;
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
@ -2201,84 +2192,9 @@ public:
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
if( interpolation == INTER_NEAREST )
{
x1 = 0;
#if CV_TRY_SSE4_1
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
else
#endif
{
#if CV_SIMD128
{
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = VTraits<v_uint16x8>::vlanes();
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
#undef CV_CONVERT_MAP
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
}
hal::warpAffineBlocklineNN(adelta + x, bdelta + x, xy, X0, Y0, bw);
else
{
short* alpha = A + y1*bw;
x1 = 0;
#if CV_TRY_AVX2
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_TRY_LASX
if ( useLASX )
x1 = opt_LASX::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_SIMD128
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1)));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1)));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1 + span)));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1 + span)));
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
hal::warpAffineBlockline(adelta + x, bdelta + x, xy, A + y1*bw, X0, Y0, bw);
}
if( interpolation == INTER_NEAREST )
@ -2703,6 +2619,97 @@ void warpAffine(int src_type,
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
else
#endif
{
#if CV_SIMD128
{
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = VTraits<v_uint16x8>::vlanes();
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
#undef CV_CONVERT_MAP
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
}
void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
CALL_HAL(warpAffineBlockline, cv_hal_warpAffineBlockline, adelta, bdelta, xy, alpha, X0, Y0, bw);
const int AB_BITS = MAX(10, (int)INTER_BITS);
int x1 = 0;
#if CV_TRY_AVX2
bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
#endif
#if CV_TRY_LASX
bool useLASX = CV_CPU_HAS_SUPPORT_LASX;
if ( useLASX )
x1 = opt_LASX::warpAffineBlockline(adelta, bdelta, xy, alpha, X0, Y0, bw);
#endif
{
#if CV_SIMD128
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = VTraits<v_float32x4>::vlanes();
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1)));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1)));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(adelta + x1 + span)));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(bdelta + x1 + span)));
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
}
} // hal::
} // cv::
@ -3105,12 +3112,6 @@ public:
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
#if CV_TRY_SSE4_1
Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
if(CV_CPU_HAS_SUPPORT_SSE4_1)
pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
#endif
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < width; x += bw0 )
@ -3129,57 +3130,9 @@ public:
double W0 = M[6]*x + M[7]*(y + y1) + M[8];
if( interpolation == INTER_NEAREST )
{
#if CV_TRY_SSE4_1
if (pwarp_impl_sse4)
pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
else
#endif
#if CV_SIMD128_64F
WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
#else
for( int x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? 1./W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
#endif
}
hal::warpPerspectiveBlocklineNN(M, xy, X0, Y0, W0, bw);
else
{
short* alpha = A + y1*bw;
#if CV_TRY_SSE4_1
if (pwarp_impl_sse4)
pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
else
#endif
#if CV_SIMD128_64F
WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
#else
for( int x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? INTER_TAB_SIZE/W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
#endif
}
hal::warpPerspectiveBlockline(M, xy, A + y1*bw, X0, Y0, W0, bw);
}
if( interpolation == INTER_NEAREST )
@ -3272,6 +3225,74 @@ void warpPerspective(int src_type,
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw)
{
CALL_HAL(warpPerspectiveBlocklineNN, cv_hal_warpPerspectiveBlocklineNN, M, xy, X0, Y0, W0, bw);
#if CV_TRY_SSE4_1
Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
if(CV_CPU_HAS_SUPPORT_SSE4_1)
pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
if (pwarp_impl_sse4)
pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
else
#endif
{
#if CV_SIMD128_64F
WarpPerspectiveLine_ProcessNN_CV_SIMD(M, xy, X0, Y0, W0, bw);
#else
for( int x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? 1./W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
#endif
}
}
void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
{
CALL_HAL(warpPerspectiveBlockline, cv_hal_warpPerspectiveBlockline, M, xy, alpha, X0, Y0, W0, bw);
#if CV_TRY_SSE4_1
Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
if(CV_CPU_HAS_SUPPORT_SSE4_1)
pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
if (pwarp_impl_sse4)
pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
else
#endif
{
#if CV_SIMD128_64F
WarpPerspectiveLine_Process_CV_SIMD(M, xy, alpha, X0, Y0, W0, bw);
#else
for( int x1 = 0; x1 < bw; x1++ )
{
double W = W0 + M[6]*x1;
W = W ? INTER_TAB_SIZE/W : 0;
double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
int X = saturate_cast<int>(fX);
int Y = saturate_cast<int>(fY);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
#endif
}
}
} // hal::
} // cv::

@ -2657,7 +2657,7 @@ TEST(Imgproc_ColorLab_Full, bitExactness)
Mat probe(256, 256, CV_8UC3), result;
rng.fill(probe, RNG::UNIFORM, 0, 255, true);
cvtColor(probe, result, codes[c]);
cvtColor(probe, result, codes[c], 0, ALGO_HINT_ACCURATE);
uint32_t h = adler32(result);
uint32_t goodHash = hashes[c*nIterations + iter];
@ -2749,7 +2749,7 @@ TEST(Imgproc_ColorLuv_Full, bitExactness)
Mat probe(256, 256, CV_8UC3), result;
rng.fill(probe, RNG::UNIFORM, 0, 255, true);
cvtColor(probe, result, codes[c]);
cvtColor(probe, result, codes[c], 0, ALGO_HINT_ACCURATE);
uint32_t h = adler32(result);
uint32_t goodHash = hashes[c*nIterations + iter];
@ -2808,7 +2808,7 @@ void runCvtColorBitExactCheck(ColorConversionCodes code, int inputType, uint32_t
Mat dst;
rng.fill(src, RNG::UNIFORM, 0, 255, true);
cv::cvtColor(src, dst, code);
cv::cvtColor(src, dst, code, 0, ALGO_HINT_ACCURATE);
uint32_t dst_hash = adler32(dst);

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if(isNodeJs) {
var Base = require("./base");

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -1,4 +1,4 @@
const isNodeJs = (typeof window) === 'undefined'? true : false;
var isNodeJs = (typeof window) === 'undefined'? true : false;
if (isNodeJs) {
var Benchmark = require('benchmark');

@ -854,7 +854,22 @@ class FuncInfo(object):
all_code_variants = []
# See https://github.com/opencv/opencv/issues/25928
# Conversion to UMat is more expensive than conversion to Mat.
# To reduce this cost, variants that convert to Mat are preferred over those that convert to UMat.
variants = []
variants_umat = []
for v in self.variants:
hasUMat = False
for a in v.args:
hasUMat = hasUMat or "UMat" in a.tp
if hasUMat :
variants_umat.append(v)
else:
variants.append(v)
variants.extend(variants_umat)
for v in variants:
code_decl = ""
code_ret = ""
code_cvt_list = []

@ -958,7 +958,7 @@ class CanUsePurePythonModuleFunction(NewOpenCVTests):
class SamplesFindFile(NewOpenCVTests):
def test_ExistedFile(self):
res = cv.samples.findFile('lena.jpg', False)
res = cv.samples.findFile('HappyFish.jpg', False)
self.assertNotEqual(res, '')
def test_MissingFile(self):

@ -1,7 +1,7 @@
# --- obsensor ---
if(NOT HAVE_OBSENSOR)
if(OBSENSOR_USE_ORBBEC_SDK)
include(${CMAKE_SOURCE_DIR}/3rdparty/orbbecsdk/orbbecsdk.cmake)
include("${OpenCV_SOURCE_DIR}/3rdparty/orbbecsdk/orbbecsdk.cmake")
download_orbbec_sdk(ORBBEC_SDK_ROOT_DIR)
message(STATUS "ORBBEC_SDK_ROOT_DIR: ${ORBBEC_SDK_ROOT_DIR}")
if(ORBBEC_SDK_ROOT_DIR)
