mirror of https://github.com/opencv/opencv.git
Merge pull request #23098 from savuor:nanMask
finiteMask() and doubles for patchNaNs() #23098

Related to #22826

Connected PR in extra: [#1037@extra](https://github.com/opencv/opencv_extra/pull/1037)

### TODOs:
- [ ] Vectorize `finiteMask()` for 64FC3 and 64FC4

### Changes
This PR:
* adds a new function `finiteMask()`
* extends `patchNaNs()` with CV_64F support
* moves `patchNaNs()` and `finiteMask()` to a separate file

**NOTE:** the function is now called `finiteMask()`, as discussed with the OpenCV core team

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
parent 34f34f6227, commit 53aad98a1a — 15 changed files with 1190 additions and 138 deletions
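For context, a minimal usage sketch of the API touched by this PR (illustrative only, not part of the diff; the sample matrix and values are made up):

```cpp
#include <opencv2/core.hpp>
#include <limits>

int main()
{
    // Hypothetical input: a small CV_64F matrix with a NaN and an Inf in it
    cv::Mat img(4, 4, CV_64FC1, cv::Scalar(1.0));
    img.at<double>(0, 0) = std::numeric_limits<double>::quiet_NaN();
    img.at<double>(1, 1) = std::numeric_limits<double>::infinity();

    // New function: per-pixel finiteness mask, 255 where all channels are finite, 0 otherwise
    cv::Mat mask;
    cv::finiteMask(img, mask);   // mask is CV_8UC1, zero at (0,0) and (1,1)

    // Extended function: patchNaNs() now also accepts CV_64F input;
    // NaNs are replaced in place with the given value, Inf values are left as-is
    cv::patchNaNs(img, 0.0);

    return 0;
}
```

`finiteMask()` writes a CV_8UC1 mask that is 255 only where every channel of a pixel is finite, while `patchNaNs()` replaces NaN elements in place and leaves ±Inf untouched, as the bit tests in the diff below show.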
@@ -0,0 +1,152 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "precomp.hpp"
#include "opencl_kernels_core.hpp"

#include "nan_mask.simd.hpp"
#include "nan_mask.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content

namespace cv {

#ifdef HAVE_OPENCL

static bool ocl_patchNaNs( InputOutputArray _a, double value )
{
    int ftype = _a.depth();

    const ocl::Device d = ocl::Device::getDefault();
    bool doubleSupport = d.doubleFPConfig() > 0;
    if (!doubleSupport && ftype == CV_64F)
    {
        return false;
    }

    int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                  format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d %s",
                         ftype == CV_64F ? "double" : "float", ftype, rowsPerWI,
                         doubleSupport ? "-D DOUBLE_SUPPORT" : ""));
    if (k.empty())
        return false;

    UMat a = _a.getUMat();
    int cn = a.channels();

    // to pass float or double to args
    if (ftype == CV_32F)
    {
        k.args(ocl::KernelArg::ReadOnlyNoSize(a), ocl::KernelArg::WriteOnly(a, cn), (float)value);
    }
    else // CV_64F
    {
        k.args(ocl::KernelArg::ReadOnlyNoSize(a), ocl::KernelArg::WriteOnly(a, cn), value);
    }

    size_t globalsize[2] = { (size_t)a.cols * cn, ((size_t)a.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

static PatchNanFunc getPatchNanFunc(bool isDouble)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getPatchNanFunc, (isDouble), CV_CPU_DISPATCH_MODES_ALL);
}

void patchNaNs( InputOutputArray _a, double _val )
{
    CV_INSTRUMENT_REGION();
    CV_Assert( _a.depth() == CV_32F || _a.depth() == CV_64F);

    CV_OCL_RUN(_a.isUMat() && _a.dims() <= 2,
               ocl_patchNaNs(_a, _val))

    Mat a = _a.getMat();
    const Mat* arrays[] = {&a, 0};
    uchar* ptrs[1] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t len = it.size*a.channels();

    PatchNanFunc func = getPatchNanFunc(_a.depth() == CV_64F);

    for (size_t i = 0; i < it.nplanes; i++, ++it)
    {
        func(ptrs[0], len, _val);
    }
}


#ifdef HAVE_OPENCL

static bool ocl_finiteMask(const UMat img, UMat mask)
{
    int channels = img.channels();
    int depth = img.depth();

    const ocl::Device d = ocl::Device::getDefault();
    bool doubleSupport = d.doubleFPConfig() > 0;
    if (!doubleSupport && depth == CV_64F)
    {
        return false;
    }

    int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
    ocl::Kernel k("finiteMask", ocl::core::finitemask_oclsrc,
                  format("-D srcT=%s -D cn=%d -D rowsPerWI=%d %s",
                         depth == CV_32F ? "float" : "double", channels, rowsPerWI,
                         doubleSupport ? "-D DOUBLE_SUPPORT" : ""));
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnlyNoSize(img), ocl::KernelArg::WriteOnly(mask));

    size_t globalsize[2] = { (size_t)img.cols, ((size_t)img.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

static FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getFiniteMaskFunc, (isDouble, cn), CV_CPU_DISPATCH_MODES_ALL);
}

void finiteMask(InputArray _src, OutputArray _mask)
{
    CV_INSTRUMENT_REGION();

    int channels = _src.channels();
    int depth = _src.depth();
    CV_Assert( channels > 0 && channels <= 4);
    CV_Assert( depth == CV_32F || depth == CV_64F );
    std::vector<int> vsz(_src.dims());
    _src.sizend(vsz.data());
    _mask.create(_src.dims(), vsz.data(), CV_8UC1);

    CV_OCL_RUN(_src.isUMat() && _mask.isUMat() && _src.dims() <= 2,
               ocl_finiteMask(_src.getUMat(), _mask.getUMat()));

    Mat src = _src.getMat();
    Mat mask = _mask.getMat();

    const Mat *arrays[]={&src, &mask, 0};
    Mat planes[2];
    NAryMatIterator it(arrays, planes);
    size_t total = planes[0].total();
    size_t i, nplanes = it.nplanes;

    FiniteMaskFunc func = getFiniteMaskFunc((depth == CV_64F), channels);

    for( i = 0; i < nplanes; i++, ++it )
    {
        const uchar* sptr = planes[0].ptr();
        uchar* dptr = planes[1].ptr();

        func(sptr, dptr, total);
    }
}
} //namespace cv
@@ -0,0 +1,440 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html


#include "precomp.hpp"

namespace cv {

typedef void (*PatchNanFunc)(uchar* tptr, size_t len, double newVal);
typedef void (*FiniteMaskFunc)(const uchar *src, uchar *dst, size_t total);

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

PatchNanFunc getPatchNanFunc(bool isDouble);
FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn);

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

static void patchNaNs_32f(uchar* ptr, size_t ulen, double newVal)
{
    CV_INSTRUMENT_REGION();

    int32_t* tptr = (int32_t*)ptr;
    Cv32suf val;
    val.f = (float)newVal;

    int j = 0;
    int len = (int)ulen;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_int32 v_pos_mask = vx_setall_s32(0x7fffffff), v_exp_mask = vx_setall_s32(0x7f800000);
    v_int32 v_val = vx_setall_s32(val.i);

    int cWidth = VTraits<v_int32>::vlanes();
    for (; j < len - cWidth*2 + 1; j += cWidth*2)
    {
        v_int32 v_src0 = vx_load(tptr + j);
        v_int32 v_src1 = vx_load(tptr + j + cWidth);

        v_int32 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask));
        v_int32 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask));

        if (v_check_any(v_or(v_cmp_mask0, v_cmp_mask1)))
        {
            v_int32 v_dst0 = v_select(v_cmp_mask0, v_val, v_src0);
            v_int32 v_dst1 = v_select(v_cmp_mask1, v_val, v_src1);

            v_store(tptr + j, v_dst0);
            v_store(tptr + j + cWidth, v_dst1);
        }
    }
#endif

    for (; j < len; j++)
    {
        if ((tptr[j] & 0x7fffffff) > 0x7f800000)
        {
            tptr[j] = val.i;
        }
    }
}


static void patchNaNs_64f(uchar* ptr, size_t ulen, double newVal)
{
    CV_INSTRUMENT_REGION();

    int64_t* tptr = (int64_t*)ptr;
    Cv64suf val;
    val.f = newVal;

    int j = 0;
    int len = (int)ulen;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_int64 v_exp_mask = vx_setall_s64(0x7FF0000000000000);
    v_int64 v_pos_mask = vx_setall_s64(0x7FFFFFFFFFFFFFFF);
    v_int64 v_val = vx_setall_s64(val.i);

    int cWidth = VTraits<v_int64>::vlanes();
    for (; j < len - cWidth * 2 + 1; j += cWidth*2)
    {
        v_int64 v_src0 = vx_load(tptr + j);
        v_int64 v_src1 = vx_load(tptr + j + cWidth);

        v_int64 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask));
        v_int64 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask));

        if (v_check_any(v_cmp_mask0) || v_check_any(v_cmp_mask1))
        {
            // v_select is not available for v_int64, emulating it
            v_int32 val32 = v_reinterpret_as_s32(v_val);
            v_int64 v_dst0 = v_reinterpret_as_s64(v_select(v_reinterpret_as_s32(v_cmp_mask0), val32, v_reinterpret_as_s32(v_src0)));
            v_int64 v_dst1 = v_reinterpret_as_s64(v_select(v_reinterpret_as_s32(v_cmp_mask1), val32, v_reinterpret_as_s32(v_src1)));

            v_store(tptr + j, v_dst0);
            v_store(tptr + j + cWidth, v_dst1);
        }
    }
#endif

    for (; j < len; j++)
        if ((tptr[j] & 0x7FFFFFFFFFFFFFFF) > 0x7FF0000000000000)
            tptr[j] = val.i;
}

PatchNanFunc getPatchNanFunc(bool isDouble)
{
    return isDouble ? (PatchNanFunc)GET_OPTIMIZED(patchNaNs_64f)
                    : (PatchNanFunc)GET_OPTIMIZED(patchNaNs_32f);
}

////// finiteMask //////

#if (CV_SIMD || CV_SIMD_SCALABLE)

template <typename _Tp, int cn>
int finiteMaskSIMD_(const _Tp *src, uchar *dst, size_t total);

template <>
int finiteMaskSIMD_<float, 1>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int osize = VTraits<v_uint8>::vlanes();
    v_uint32 vmaskExp = vx_setall_u32(0x7f800000);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - osize + 1; i += osize )
    {
        v_uint32 vv0, vv1, vv2, vv3;
        vv0 = v_ne(v_and(vx_load(src + i              ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i +   (osize/4)), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i + 2*(osize/4)), vmaskExp), vmaskExp);
        vv3 = v_ne(v_and(vx_load(src + i + 3*(osize/4)), vmaskExp), vmaskExp);

        v_store(dst + i, v_pack_b(vv0, vv1, vv2, vv3));
    }

    return i;
}


template <>
int finiteMaskSIMD_<float, 2>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int size8 = VTraits<v_uint8>::vlanes();
    v_uint32 vmaskExp = vx_setall_u32(0x7f800000);
    v_uint16 vmaskBoth = vx_setall_u16(0xffff);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - (size8 / 2) + 1; i += (size8 / 2) )
    {
        v_uint32 vv0, vv1, vv2, vv3;
        vv0 = v_ne(v_and(vx_load(src + i*2                ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i*2 +   (size8 / 4)), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i*2 + 2*(size8 / 4)), vmaskExp), vmaskExp);
        vv3 = v_ne(v_and(vx_load(src + i*2 + 3*(size8 / 4)), vmaskExp), vmaskExp);
        v_uint8 velems = v_pack_b(vv0, vv1, vv2, vv3);
        v_uint16 vfinite = v_eq(v_reinterpret_as_u16(velems), vmaskBoth);

        // 2nd argument in vfinite is useless
        v_store_low(dst + i, v_pack(vfinite, vfinite));
    }

    return i;
}


template <>
int finiteMaskSIMD_<float, 3>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int npixels = VTraits<v_float32>::vlanes();
    v_uint32 vmaskExp = vx_setall_u32(0x7f800000);
    v_uint32 z = vx_setzero_u32();

    int i = 0;
    int total = (int)utotal;
    for (; i < total - npixels + 1; i += npixels)
    {
        v_uint32 vv0, vv1, vv2;
        vv0 = v_ne(v_and(vx_load(src + i*3            ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i*3 +   npixels), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i*3 + 2*npixels), vmaskExp), vmaskExp);

        v_uint8 velems = v_pack_b(vv0, vv1, vv2, z);

        // 2nd arg is useless
        v_uint8 vsh1 = v_extract<1>(velems, velems);
        v_uint8 vsh2 = v_extract<2>(velems, velems);

        v_uint8 vres3 = v_and(v_and(velems, vsh1), vsh2);
        for (int j = 0; j < npixels; j++)
        {
            dst[i + j] = v_get0(vres3);
            // 2nd arg is useless
            vres3 = v_extract<3>(vres3, vres3);
        }
    }

    return i;
}


template <>
int finiteMaskSIMD_<float, 4>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    const int nfloats = VTraits<v_uint32>::vlanes();
    const v_uint32 vMaskExp = vx_setall_u32(0x7f800000);
    v_uint32 vmaskAll4 = vx_setall_u32(0xFFFFFFFF);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        v_uint32 v0 = vx_load(src + i * 4 + 0*nfloats);
        v_uint32 v1 = vx_load(src + i * 4 + 1*nfloats);
        v_uint32 v2 = vx_load(src + i * 4 + 2*nfloats);
        v_uint32 v3 = vx_load(src + i * 4 + 3*nfloats);
        v_uint32 v4 = vx_load(src + i * 4 + 4*nfloats);
        v_uint32 v5 = vx_load(src + i * 4 + 5*nfloats);
        v_uint32 v6 = vx_load(src + i * 4 + 6*nfloats);
        v_uint32 v7 = vx_load(src + i * 4 + 7*nfloats);

        v_uint32 vmask0 = v_ne(v_and(v0, vMaskExp), vMaskExp);
        v_uint32 vmask1 = v_ne(v_and(v1, vMaskExp), vMaskExp);
        v_uint32 vmask2 = v_ne(v_and(v2, vMaskExp), vMaskExp);
        v_uint32 vmask3 = v_ne(v_and(v3, vMaskExp), vMaskExp);
        v_uint32 vmask4 = v_ne(v_and(v4, vMaskExp), vMaskExp);
        v_uint32 vmask5 = v_ne(v_and(v5, vMaskExp), vMaskExp);
        v_uint32 vmask6 = v_ne(v_and(v6, vMaskExp), vMaskExp);
        v_uint32 vmask7 = v_ne(v_and(v7, vMaskExp), vMaskExp);

        v_uint8 velems0 = v_pack_b(vmask0, vmask1, vmask2, vmask3);
        v_uint8 velems1 = v_pack_b(vmask4, vmask5, vmask6, vmask7);

        v_uint32 vresWide0 = v_eq(v_reinterpret_as_u32(velems0), vmaskAll4);
        v_uint32 vresWide1 = v_eq(v_reinterpret_as_u32(velems1), vmaskAll4);

        // last 2 args are useless
        v_uint8 vres = v_pack_b(vresWide0, vresWide1, vresWide0, vresWide1);

        v_store_low(dst + i, vres);
    }

    return i;
}


template <>
int finiteMaskSIMD_<double, 1>(const double *dsrc, uchar *dst, size_t utotal)
{
    const uint64_t* src = (const uint64_t*)dsrc;
    const int size8 = VTraits<v_uint8>::vlanes();
    int i = 0;
    int total = (int)utotal;

    v_uint64 vmaskExp = vx_setall_u64(0x7ff0000000000000);
    v_uint64 z = vx_setzero_u64();

    for(; i < total - (size8 / 2) + 1; i += (size8 / 2) )
    {
        v_uint64 vv0, vv1, vv2, vv3;
        vv0 = v_ne(v_and(vx_load(src + i                ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i +   (size8 / 8)), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i + 2*(size8 / 8)), vmaskExp), vmaskExp);
        vv3 = v_ne(v_and(vx_load(src + i + 3*(size8 / 8)), vmaskExp), vmaskExp);

        v_uint8 v = v_pack_b(vv0, vv1, vv2, vv3, z, z, z, z);

        v_store_low(dst + i, v);
    }

    return i;
}

template <>
int finiteMaskSIMD_<double, 2>(const double *dsrc, uchar *dst, size_t utotal)
{
    const uint64_t* src = (const uint64_t*)dsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    const int ndoubles = VTraits<v_uint64>::vlanes();
    v_uint64 vmaskExp = vx_setall_u64(0x7ff0000000000000);
    v_uint16 vmaskBoth = vx_setall_u16(0xffff);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        v_uint64 vv0 = v_ne(v_and(vx_load(src + i*2 + 0*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv1 = v_ne(v_and(vx_load(src + i*2 + 1*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv2 = v_ne(v_and(vx_load(src + i*2 + 2*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv3 = v_ne(v_and(vx_load(src + i*2 + 3*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv4 = v_ne(v_and(vx_load(src + i*2 + 4*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv5 = v_ne(v_and(vx_load(src + i*2 + 5*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv6 = v_ne(v_and(vx_load(src + i*2 + 6*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv7 = v_ne(v_and(vx_load(src + i*2 + 7*ndoubles), vmaskExp), vmaskExp);

        v_uint8 velems0 = v_pack_b(vv0, vv1, vv2, vv3, vv4, vv5, vv6, vv7);

        v_uint16 vfinite0 = v_eq(v_reinterpret_as_u16(velems0), vmaskBoth);

        // 2nd arg is useless
        v_uint8 vres = v_pack(vfinite0, vfinite0);
        v_store_low(dst + i, vres);
    }

    return i;
}


template <>
int finiteMaskSIMD_<double, 3>(const double *dsrc, uchar *dst, size_t utotal)
{
    //TODO: vectorize it properly

    const uint64_t* src = (const uint64_t*)dsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    uint64_t maskExp = 0x7ff0000000000000;

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        for (int j = 0; j < npixels; j++)
        {
            uint64_t val0 = src[i * 3 + j * 3 + 0];
            uint64_t val1 = src[i * 3 + j * 3 + 1];
            uint64_t val2 = src[i * 3 + j * 3 + 2];

            bool finite = ((val0 & maskExp) != maskExp) &&
                          ((val1 & maskExp) != maskExp) &&
                          ((val2 & maskExp) != maskExp);

            dst[i + j] = finite ? 255 : 0;
        }
    }

    return i;
}

template <>
int finiteMaskSIMD_<double, 4>(const double *dsrc, uchar *dst, size_t utotal)
{
    //TODO: vectorize it properly

    uint64_t* src = (uint64_t*)dsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    const int ndoubles = VTraits<v_uint64>::vlanes();
    v_uint16 vmaskExp16 = vx_setall_u16(0x7ff0);
    v_uint32 z = vx_setzero_u32();

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        v_uint16 vexpb0, vexpb1, vexpb2, vexpb3, vexpb4, vexpb5, vexpb6, vexpb7;
        v_uint16 dummy;
        v_load_deinterleave((uint16_t*)(src + 0*4*ndoubles), dummy, dummy, dummy, vexpb0);
        v_load_deinterleave((uint16_t*)(src + 1*4*ndoubles), dummy, dummy, dummy, vexpb1);
        v_load_deinterleave((uint16_t*)(src + 2*4*ndoubles), dummy, dummy, dummy, vexpb2);
        v_load_deinterleave((uint16_t*)(src + 3*4*ndoubles), dummy, dummy, dummy, vexpb3);

        v_uint16 vcmp0 = v_eq(v_and(vexpb0, vmaskExp16), vmaskExp16);
        v_uint16 vcmp1 = v_eq(v_and(vexpb1, vmaskExp16), vmaskExp16);
        v_uint16 vcmp2 = v_eq(v_and(vexpb2, vmaskExp16), vmaskExp16);
        v_uint16 vcmp3 = v_eq(v_and(vexpb3, vmaskExp16), vmaskExp16);

        v_uint8 velems0 = v_pack(vcmp0, vcmp1);
        v_uint8 velems1 = v_pack(vcmp2, vcmp3);

        v_uint32 vResWide0 = v_eq(v_reinterpret_as_u32(velems0), z);
        v_uint32 vResWide1 = v_eq(v_reinterpret_as_u32(velems1), z);

        v_uint16 vp16 = v_pack(vResWide0, vResWide1);

        // 2nd arg is useless
        v_uint8 vres = v_pack(vp16, vp16);
        v_store_low(dst, vres);

        src += npixels * 4;
        dst += npixels;
    }

    return i;
}

#endif


template <typename _Tp, int cn>
void finiteMask_(const uchar *src, uchar *dst, size_t total)
{
    CV_INSTRUMENT_REGION();
    size_t i = 0;
    const _Tp* tsrc = (const _Tp*) src;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    i = finiteMaskSIMD_<_Tp, cn>(tsrc, dst, total);
#endif

    for(; i < total; i++ )
    {
        bool finite = true;
        for (int c = 0; c < cn; c++)
        {
            _Tp val = tsrc[i * cn + c];
            finite = finite && !cvIsNaN(val) && !cvIsInf(val);
        }
        dst[i] = finite ? 255 : 0;
    }
}

FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn)
{
    static FiniteMaskFunc tab[] =
    {
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 1>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 2>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 3>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 4>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 1>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 2>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 3>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 4>)),
    };

    int idx = (isDouble ? 4 : 0) + cn - 1;
    return tab[idx];
}

#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // namespace cv
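Both the SIMD and scalar paths above rely on the same IEEE-754 bit trick: a value is non-finite (NaN or ±Inf) exactly when all of its exponent bits are set, and it is NaN when, in addition, the mantissa is non-zero. A standalone scalar sketch of the two tests, in plain C++ without the OpenCV SIMD wrappers (function names here are illustrative only):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

// Non-finite (NaN or +/-Inf): the exponent field is all ones.
// This is the per-channel test the finiteMask() kernels perform with the 0x7f800000 mask.
static bool isNonFinite32(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));   // safe type pun
    return (bits & 0x7f800000u) == 0x7f800000u;
}

// NaN only: exponent all ones AND mantissa non-zero. patchNaNs_32f expresses this as
// (bits & 0x7fffffff) > 0x7f800000 -- Inf compares equal to the mask, not greater.
static bool isNaN32(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return (bits & 0x7fffffffu) > 0x7f800000u;
}

int main()
{
    const float nanv = std::numeric_limits<float>::quiet_NaN();
    const float inf  = std::numeric_limits<float>::infinity();

    assert( isNonFinite32(nanv) &&  isNaN32(nanv));  // NaN: caught by both tests
    assert( isNonFinite32(inf)  && !isNaN32(inf));   // Inf: non-finite but not NaN
    assert(!isNonFinite32(1.0f) && !isNaN32(1.0f));  // ordinary value: finite
    return 0;
}
```

This is why `patchNaNs()` uses a strict greater-than against the exponent mask (so only NaNs are replaced, ±Inf is kept), while `finiteMask()` only needs the equality test per channel; the 64-bit variants use the analogous masks 0x7ff0000000000000 and 0x7fffffffffffffff.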
@@ -0,0 +1,44 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

// This kernel is compiled with the following possible defines:
// - srcT, cn: source type and number of channels per pixel
// - rowsPerWI: Intel GPU optimization
// - DOUBLE_SUPPORT: enable double support if available

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

__kernel void finiteMask(__global const uchar * srcptr, int srcstep, int srcoffset,
                         __global uchar * dstptr, int dststep, int dstoffset,
                         int rows, int cols )
{
    int x = get_global_id(0);
    int y0 = get_global_id(1) * rowsPerWI;

    if (x < cols)
    {
        int src_index = mad24(y0, srcstep, mad24(x, (int)sizeof(srcT) * cn, srcoffset));
        int dst_index = mad24(y0, dststep, x + dstoffset);

        for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += srcstep, dst_index += dststep)
        {
            bool vfinite = true;

            for (int c = 0; c < cn; c++)
            {
                srcT val = *(__global srcT *)(srcptr + src_index + c * (int)sizeof(srcT));

                vfinite = vfinite && !isnan(val) & !isinf(val);
            }

            *(dstptr + dst_index) = vfinite ? 255 : 0;
        }
    }
}