From 53aad98a1a12ba01bbcb1e8e73f854128a7744c0 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 9 Nov 2023 08:32:47 +0100 Subject: [PATCH] Merge pull request #23098 from savuor:nanMask finiteMask() and doubles for patchNaNs() #23098 Related to #22826 Connected PR in extra: [#1037@extra](https://github.com/opencv/opencv_extra/pull/1037) ### TODOs: - [ ] Vectorize `finiteMask()` for 64FC3 and 64FC4 ### Changes This PR: * adds a new function `finiteMask()` * extends `patchNaNs()` by CV_64F support * moves `patchNaNs()` and `finiteMask()` to a separate file **NOTE:** now the function is called `finiteMask()` as discussed with the OpenCV core team ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake --- modules/3d/perf/perf_tsdf.cpp | 17 +- modules/3d/src/rgbd/odometry_functions.cpp | 14 +- modules/3d/test/test_odometry.cpp | 10 +- modules/3d/test/test_tsdf.cpp | 49 +-- modules/core/CMakeLists.txt | 1 + modules/core/include/opencv2/core.hpp | 13 +- modules/core/perf/opencl/perf_arithm.cpp | 97 ++++- modules/core/perf/perf_arithm.cpp | 132 +++++++ modules/core/src/mathfuncs.cpp | 70 +--- modules/core/src/nan_mask.dispatch.cpp | 152 +++++++ modules/core/src/nan_mask.simd.hpp | 440 +++++++++++++++++++++ modules/core/src/opencl/finitemask.cl | 44 +++ modules/core/test/ocl/test_arithm.cpp | 132 ++++++- modules/core/test/test_arithm.cpp | 156 ++++++++ modules/core/test/test_precomp.hpp | 1 + 15 files changed, 1190 insertions(+), 138 deletions(-) create mode 100644 modules/core/src/nan_mask.dispatch.cpp create mode 100644 modules/core/src/nan_mask.simd.hpp create mode 100644 modules/core/src/opencl/finitemask.cl diff --git a/modules/3d/perf/perf_tsdf.cpp b/modules/3d/perf/perf_tsdf.cpp index 953f09ddf9..121ec5feb1 100644 --- a/modules/3d/perf/perf_tsdf.cpp +++ b/modules/3d/perf/perf_tsdf.cpp @@ -302,6 +302,9 @@ void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray im Mat_ img = image.getMat(); + Mat goods; + finiteMask(points, goods); + Range range(0, sz.height); const int nstripes = -1; parallel_for_(range, [&](const Range&) @@ -311,6 +314,7 @@ void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray im Vec4b* imgRow = img[y]; const ptype* ptsRow = points[y]; const ptype* nrmRow = normals[y]; + const uchar* goodRow = goods.ptr(y); for (int x = 0; x < sz.width; x++) { @@ -319,7 +323,7 @@ void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray im Vec4b color; - if (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z) ) + if ( !goodRow[x] ) { color = Vec4b(0, 32, 0, 0); } @@ -357,6 +361,11 @@ void renderPointsNormalsColors(InputArray _points, InputArray, InputArray _color Points points = _points.getMat(); Colors colors = _colors.getMat(); + Mat goods, goodp, goodc; + finiteMask(points, goodp); + finiteMask(colors, goodc); + goods = goodp & goodc; + Mat_ img = image.getMat(); Range range(0, sz.height); @@ -366,18 +375,16 @@ void renderPointsNormalsColors(InputArray _points, InputArray, InputArray _color for (int y = range.start; y < range.end; y++) { Vec4b* imgRow = img[y]; - const ptype* ptsRow = points[y]; const ptype* clrRow = colors[y]; + const uchar* goodRow = goods.ptr(y); for (int x = 0; x < sz.width; x++) { - Point3f p = fromPtype(ptsRow[x]); Point3f c = fromPtype(clrRow[x]); Vec4b color; - if (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z) - || cvIsNaN(c.x) || cvIsNaN(c.y) || cvIsNaN(c.z)) + if ( !goodRow[x] ) { color = Vec4b(0, 32, 0, 0); } diff --git a/modules/3d/src/rgbd/odometry_functions.cpp b/modules/3d/src/rgbd/odometry_functions.cpp index 96d1c2fe80..d210e0a9ac 100644 --- a/modules/3d/src/rgbd/odometry_functions.cpp +++ b/modules/3d/src/rgbd/odometry_functions.cpp @@ -102,16 +102,8 @@ static void extendPyrMaskByPyrNormals(const std::vector& pyramidNormals, UMat maski = pyramidMask[i]; UMat normali = pyramidNormals[i]; UMat validNormalMask; - // NaN check - cv::compare(normali, normali, validNormalMask, CMP_EQ); - CV_Assert(validNormalMask.type() == CV_8UC4); - - std::vector channelMasks; - split(validNormalMask, channelMasks); - UMat tmpChMask; - cv::bitwise_and(channelMasks[0], channelMasks[1], tmpChMask); - cv::bitwise_and(channelMasks[2], tmpChMask, tmpChMask); - cv::bitwise_and(maski, tmpChMask, maski); + finiteMask(normali, validNormalMask); + cv::bitwise_and(maski, validNormalMask, maski); } } } @@ -727,7 +719,7 @@ void computeCorresps(const Matx33f& _K, const Mat& rt, { float ddst = depthDst_row[udst]; - if (maskDst_row[udst] && !cvIsNaN(ddst)) + if (maskDst_row[udst]) { float transformed_ddst = static_cast(ddst * (KRK_inv6_u1[udst] + KRK_inv7_v1_plus_KRK_inv8[vdst]) + ktinv.z); diff --git a/modules/3d/test/test_odometry.cpp b/modules/3d/test/test_odometry.cpp index 5e7365e205..a661495687 100644 --- a/modules/3d/test/test_odometry.cpp +++ b/modules/3d/test/test_odometry.cpp @@ -16,11 +16,8 @@ void dilateFrame(Mat& image, Mat& depth) CV_Assert(depth.type() == CV_32FC1); CV_Assert(depth.size() == image.size()); - Mat mask(image.size(), CV_8UC1, Scalar(255)); - for(int y = 0; y < depth.rows; y++) - for(int x = 0; x < depth.cols; x++) - if(cvIsNaN(depth.at(y,x)) || depth.at(y,x) > 10 || depth.at(y,x) <= FLT_EPSILON) - mask.at(y,x) = 0; + Mat mask; + inRange(depth, FLT_EPSILON, 10, mask); image.setTo(255, ~mask); Mat minImage; @@ -726,7 +723,8 @@ TEST(RGBD_Odometry_WarpFrame, nansAreMasked) ASSERT_EQ(0, rgbDiff); - Mat goodVals = (w.warpedDepth == w.warpedDepth); + Mat goodVals; + finiteMask(w.warpedDepth, goodVals); double l2diff = cv::norm(w.dstDepth, w.warpedDepth, NORM_L2, goodVals); double lidiff = cv::norm(w.dstDepth, w.warpedDepth, NORM_INF, goodVals); diff --git a/modules/3d/test/test_tsdf.cpp b/modules/3d/test/test_tsdf.cpp index 6e7dc0bf67..948149aa51 100644 --- a/modules/3d/test/test_tsdf.cpp +++ b/modules/3d/test/test_tsdf.cpp @@ -295,6 +295,9 @@ void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray im Points points = _points.getMat(); Normals normals = _normals.getMat(); + Mat goods; + finiteMask(points, goods); + Mat_ img = image.getMat(); Range range(0, sz.height); @@ -306,6 +309,7 @@ void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray im Vec4b* imgRow = img[y]; const ptype* ptsRow = points[y]; const ptype* nrmRow = normals[y]; + const uchar* goodRow = goods.ptr(y); for (int x = 0; x < sz.width; x++) { @@ -314,7 +318,7 @@ void renderPointsNormals(InputArray _points, InputArray _normals, OutputArray im Vec4b color; - if (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z)) + if (!goodRow[x]) { color = Vec4b(0, 32, 0, 0); } @@ -352,6 +356,11 @@ void renderPointsNormalsColors(InputArray _points, InputArray, InputArray _color Points points = _points.getMat(); Colors colors = _colors.getMat(); + Mat goods, goodc, goodp; + finiteMask(points, goodp); + finiteMask(colors, goodc); + goods = goodp & goodc; + Mat_ img = image.getMat(); Range range(0, sz.height); @@ -361,18 +370,16 @@ void renderPointsNormalsColors(InputArray _points, InputArray, InputArray _color for (int y = range.start; y < range.end; y++) { Vec4b* imgRow = img[y]; - const ptype* ptsRow = points[y]; const ptype* clrRow = colors[y]; + const uchar* goodRow = goods.ptr(y); for (int x = 0; x < sz.width; x++) { - Point3f p = fromPtype(ptsRow[x]); Point3f c = fromPtype(clrRow[x]); Vec4b color; - if (cvIsNaN(p.x) || cvIsNaN(p.y) || cvIsNaN(p.z) - || cvIsNaN(c.x) || cvIsNaN(c.y) || cvIsNaN(c.z)) + if (!goodRow[x]) { color = Vec4b(0, 32, 0, 0); } @@ -587,33 +594,6 @@ void boundingBoxGrowthTest(bool enableGrowth) } -static Mat nanMask(Mat img) -{ - int depth = img.depth(); - Mat mask(img.size(), CV_8U); - for (int y = 0; y < img.rows; y++) - { - uchar *maskRow = mask.ptr(y); - if (depth == CV_32F) - { - Vec4f *imgrow = img.ptr(y); - for (int x = 0; x < img.cols; x++) - { - maskRow[x] = (imgrow[x] == imgrow[x]) * 255; - } - } - else if (depth == CV_64F) - { - Vec4d *imgrow = img.ptr(y); - for (int x = 0; x < img.cols; x++) - { - maskRow[x] = (imgrow[x] == imgrow[x]) * 255; - } - } - } - return mask; -} - template static Mat_ normalsErrorT(Mat_ srcNormals, Mat_ dstNormals) { @@ -725,8 +705,9 @@ void regressionVolPoseRot() split(uptsRot, ptsRotCh); Mat maskPts0 = ptsCh[2] > 0; Mat maskPtsRot = ptsRotCh[2] > 0; - Mat maskNrm0 = nanMask(mnrm); - Mat maskNrmRot = nanMask(mnrmRot); + Mat maskNrm0, maskNrmRot; + finiteMask(mnrm, maskNrm0); + finiteMask(mnrmRot, maskNrmRot); Mat maskPtsDiff, maskNrmDiff; cv::bitwise_xor(maskPts0, maskPtsRot, maskPtsDiff); cv::bitwise_xor(maskNrm0, maskNrmRot, maskNrmDiff); diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 2835887def..a1611dfcf6 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -10,6 +10,7 @@ ocv_add_dispatched_file(has_non_zero SSE2 AVX2) ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD) ocv_add_dispatched_file(mean SSE2 AVX2) ocv_add_dispatched_file(merge SSE2 AVX2) +ocv_add_dispatched_file(nan_mask SSE2 AVX2) ocv_add_dispatched_file(split SSE2 AVX2) ocv_add_dispatched_file(sum SSE2 AVX2) diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index d76120238c..e91a7ab0fe 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -1697,12 +1697,21 @@ elements. CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0, double minVal = -DBL_MAX, double maxVal = DBL_MAX); -/** @brief converts NaNs to the given number -@param a input/output matrix (CV_32F type). +/** @brief Replaces NaNs by given number +@param a input/output matrix (CV_32F or CV_64F type) @param val value to convert the NaNs */ CV_EXPORTS_W void patchNaNs(InputOutputArray a, double val = 0); +/** @brief Generates a mask of finite float values, i.e. not NaNs nor Infs. + +An element is set to to 255 (all 1-bits) if all channels are finite. +@param src Input matrix, should contain float or double elements of 1 to 4 channels +@param mask Output matrix of the same size as input of type CV_8UC1 +*/ +CV_EXPORTS_W void finiteMask(InputArray src, OutputArray mask); + + /** @brief Performs generalized matrix multiplication. The function cv::gemm performs generalized matrix multiplication similar to the diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index 8d1e7a6288..94af3a99cb 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -41,6 +41,7 @@ #include "../perf_precomp.hpp" #include "opencv2/ts/ocl_perf.hpp" +#include "opencv2/core/softfloat.hpp" #ifdef HAVE_OPENCL @@ -1038,14 +1039,40 @@ OCL_PERF_TEST_P(ConvertScaleAbsFixture, ConvertScaleAbs, ///////////// PatchNaNs //////////////////////// +template +_Tp randomNan(RNG& rng); + +template<> +float randomNan(RNG& rng) +{ + uint32_t r = rng.next(); + Cv32suf v; + v.u = r; + // exp & set a bit to avoid zero mantissa + v.u = v.u | 0x7f800001; + return v.f; +} + +template<> +double randomNan(RNG& rng) +{ + uint32_t r0 = rng.next(); + uint32_t r1 = rng.next(); + Cv64suf v; + v.u = (uint64_t(r0) << 32) | uint64_t(r1); + // exp &set a bit to avoid zero mantissa + v.u = v.u | 0x7ff0000000000001; + return v.f; +} + typedef Size_MatType PatchNaNsFixture; OCL_PERF_TEST_P(PatchNaNsFixture, PatchNaNs, - ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4))) + ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC3, CV_64FC4))) { const Size_MatType_t params = GetParam(); Size srcSize = get<0>(params); - const int type = get<1>(params), cn = CV_MAT_CN(type); + const int type = get<1>(params), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); checkDeviceMaxMemoryAllocSize(srcSize, type); @@ -1056,11 +1083,22 @@ OCL_PERF_TEST_P(PatchNaNsFixture, PatchNaNs, { Mat src_ = src.getMat(ACCESS_RW); srcSize.width *= cn; + RNG& rng = theRNG(); for (int y = 0; y < srcSize.height; ++y) { - float * const ptr = src_.ptr(y); + float *const ptrf = src_.ptr(y); + double *const ptrd = src_.ptr(y); for (int x = 0; x < srcSize.width; ++x) - ptr[x] = (x + y) % 2 == 0 ? std::numeric_limits::quiet_NaN() : ptr[x]; + { + if (depth == CV_32F) + { + ptrf[x] = (x + y) % 2 == 0 ? randomNan(rng) : ptrf[x]; + } + else if (depth == CV_64F) + { + ptrd[x] = (x + y) % 2 == 0 ? randomNan(rng) : ptrd[x]; + } + } } } @@ -1069,6 +1107,57 @@ OCL_PERF_TEST_P(PatchNaNsFixture, PatchNaNs, SANITY_CHECK(src); } +////////////// finiteMask //////////////////////// + +typedef Size_MatType FiniteMaskFixture; + +OCL_PERF_TEST_P(FiniteMaskFixture, FiniteMask, + ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC3, CV_64FC4))) +{ + const Size_MatType_t params = GetParam(); + Size srcSize = get<0>(params); + const int type = get<1>(params), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); + + checkDeviceMaxMemoryAllocSize(srcSize, type); + + UMat src(srcSize, type); + UMat mask(srcSize, CV_8UC1); + declare.in(src, WARMUP_RNG).out(mask); + + // generating NaNs + { + Mat src_ = src.getMat(ACCESS_RW); + srcSize.width *= cn; + const softfloat fpinf = softfloat ::inf(); + const softfloat fninf = softfloat ::inf().setSign(true); + const softdouble dpinf = softdouble::inf(); + const softdouble dninf = softdouble::inf().setSign(true); + RNG& rng = theRNG(); + for (int y = 0; y < srcSize.height; ++y) + { + float *const ptrf = src_.ptr(y); + double *const ptrd = src_.ptr(y); + for (int x = 0; x < srcSize.width; ++x) + { + int rem = (x + y) % 10; + if (depth == CV_32F) + { + ptrf[x] = rem < 4 ? randomNan(rng) : + rem == 5 ? (float )((x + y)%2 ? fpinf : fninf) : ptrf[x]; + } + else if (depth == CV_64F) + { + ptrd[x] = rem < 4 ? randomNan(rng) : + rem == 5 ? (double)((x + y)%2 ? dpinf : dninf) : ptrd[x]; + } + } + } + } + + OCL_TEST_CYCLE() cv::finiteMask(src, mask); + + SANITY_CHECK(mask); +} ///////////// ScaleAdd //////////////////////// diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp index 872963fc65..c48e23a8f6 100644 --- a/modules/core/perf/perf_arithm.cpp +++ b/modules/core/perf/perf_arithm.cpp @@ -1,4 +1,5 @@ #include "perf_precomp.hpp" +#include "opencv2/core/softfloat.hpp" #include namespace opencv_test @@ -451,4 +452,135 @@ INSTANTIATE_TEST_CASE_P(/*nothing*/ , BinaryOpTest, ) ); + +///////////// PatchNaNs //////////////////////// + +template +_Tp randomNan(RNG& rng); + +template<> +float randomNan(RNG& rng) +{ + uint32_t r = rng.next(); + Cv32suf v; + v.u = r; + // exp & set a bit to avoid zero mantissa + v.u = v.u | 0x7f800001; + return v.f; +} + +template<> +double randomNan(RNG& rng) +{ + uint32_t r0 = rng.next(); + uint32_t r1 = rng.next(); + Cv64suf v; + v.u = (uint64_t(r0) << 32) | uint64_t(r1); + // exp &set a bit to avoid zero mantissa + v.u = v.u | 0x7ff0000000000001; + return v.f; +} + +typedef Size_MatType PatchNaNsFixture; + +PERF_TEST_P_(PatchNaNsFixture, PatchNaNs) +{ + const Size_MatType_t params = GetParam(); + Size srcSize = get<0>(params); + const int type = get<1>(params), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); + + Mat src(srcSize, type); + declare.in(src, WARMUP_RNG).out(src); + + // generating NaNs + { + srcSize.width *= cn; + RNG& rng = theRNG(); + for (int y = 0; y < srcSize.height; ++y) + { + float *const ptrf = src.ptr(y); + double *const ptrd = src.ptr(y); + for (int x = 0; x < srcSize.width; ++x) + { + if (depth == CV_32F) + { + ptrf[x] = (x + y) % 2 == 0 ? randomNan(rng) : ptrf[x]; + } + else if (depth == CV_64F) + { + ptrd[x] = (x + y) % 2 == 0 ? randomNan(rng) : ptrd[x]; + } + } + } + } + + TEST_CYCLE() cv::patchNaNs(src, 17.7); + + SANITY_CHECK(src); +} + +INSTANTIATE_TEST_CASE_P(/*nothing*/ , PatchNaNsFixture, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p, sz2160p), + testing::Values(CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC2, CV_64FC3, CV_64FC4) + ) +); + + +////////////// finiteMask //////////////////////// + +typedef Size_MatType FiniteMaskFixture; + +PERF_TEST_P_(FiniteMaskFixture, FiniteMask) +{ + const Size_MatType_t params = GetParam(); + Size srcSize = get<0>(params); + const int type = get<1>(params), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); + + Mat src(srcSize, type); + Mat mask(srcSize, CV_8UC1); + declare.in(src, WARMUP_RNG).out(mask); + + // generating NaNs + { + srcSize.width *= cn; + const softfloat fpinf = softfloat ::inf(); + const softfloat fninf = softfloat ::inf().setSign(true); + const softdouble dpinf = softdouble::inf(); + const softdouble dninf = softdouble::inf().setSign(true); + RNG& rng = theRNG(); + for (int y = 0; y < srcSize.height; ++y) + { + float *const ptrf = src.ptr(y); + double *const ptrd = src.ptr(y); + for (int x = 0; x < srcSize.width; ++x) + { + int rem = (x + y) % 10; + if (depth == CV_32F) + { + ptrf[x] = rem < 4 ? randomNan(rng) : + rem == 5 ? (float )((x + y)%2 ? fpinf : fninf) : ptrf[x]; + } + else if (depth == CV_64F) + { + ptrd[x] = rem < 4 ? randomNan(rng) : + rem == 5 ? (double)((x + y)%2 ? dpinf : dninf) : ptrd[x]; + } + } + } + } + + TEST_CYCLE() cv::finiteMask(src, mask); + + SANITY_CHECK(mask); +} + +INSTANTIATE_TEST_CASE_P(/*nothing*/ , FiniteMaskFixture, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p, sz2160p), + testing::Values(CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC2, CV_64FC3, CV_64FC4) + ) +); + + } // namespace diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 8b1be9c342..56da605ada 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -1574,75 +1574,7 @@ bool checkRange(InputArray _src, bool quiet, Point* pt, double minVal, double ma return true; } -#ifdef HAVE_OPENCL - -static bool ocl_patchNaNs( InputOutputArray _a, float value ) -{ - int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; - ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=float -D DEPTH_dst=%d -D rowsPerWI=%d", - CV_32F, rowsPerWI)); - if (k.empty()) - return false; - - UMat a = _a.getUMat(); - int cn = a.channels(); - - k.args(ocl::KernelArg::ReadOnlyNoSize(a), - ocl::KernelArg::WriteOnly(a, cn), (float)value); - - size_t globalsize[2] = { (size_t)a.cols * cn, ((size_t)a.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); -} - -#endif - -void patchNaNs( InputOutputArray _a, double _val ) -{ - CV_INSTRUMENT_REGION(); - - CV_Assert( _a.depth() == CV_32F ); - - CV_OCL_RUN(_a.isUMat() && _a.dims() <= 2, - ocl_patchNaNs(_a, (float)_val)) - - Mat a = _a.getMat(); - const Mat* arrays[] = {&a, 0}; - int* ptrs[1] = {}; - NAryMatIterator it(arrays, (uchar**)ptrs); - size_t len = it.size*a.channels(); - Cv32suf val; - val.f = (float)_val; - -#if (CV_SIMD || CV_SIMD_SCALABLE) - v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000); - v_int32 v_val = vx_setall_s32(val.i); -#endif - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - int* tptr = ptrs[0]; - size_t j = 0; - -#if (CV_SIMD || CV_SIMD_SCALABLE) - size_t cWidth = (size_t)VTraits::vlanes(); - for ( ; j + cWidth <= len; j += cWidth) - { - v_int32 v_src = vx_load(tptr + j); - v_int32 v_cmp_mask = v_lt(v_mask2, v_and(v_src, v_mask1)); - v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src); - v_store(tptr + j, v_dst); - } - vx_cleanup(); -#endif - - for( ; j < len; j++ ) - if( (tptr[j] & 0x7fffffff) > 0x7f800000 ) - tptr[j] = val.i; - } -} - -} +} // namespace cv #ifndef OPENCV_EXCLUDE_C_API diff --git a/modules/core/src/nan_mask.dispatch.cpp b/modules/core/src/nan_mask.dispatch.cpp new file mode 100644 index 0000000000..6170316c6a --- /dev/null +++ b/modules/core/src/nan_mask.dispatch.cpp @@ -0,0 +1,152 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" +#include "opencl_kernels_core.hpp" + +#include "nan_mask.simd.hpp" +#include "nan_mask.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +namespace cv { + +#ifdef HAVE_OPENCL + +static bool ocl_patchNaNs( InputOutputArray _a, double value ) +{ + int ftype = _a.depth(); + + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; + if (!doubleSupport && ftype == CV_64F) + { + return false; + } + + int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; + ocl::Kernel k("KF", ocl::core::arithm_oclsrc, + format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d %s", + ftype == CV_64F ? "double" : "float", ftype, rowsPerWI, + doubleSupport ? "-D DOUBLE_SUPPORT" : "")); + if (k.empty()) + return false; + + UMat a = _a.getUMat(); + int cn = a.channels(); + + // to pass float or double to args + if (ftype == CV_32F) + { + k.args(ocl::KernelArg::ReadOnlyNoSize(a), ocl::KernelArg::WriteOnly(a, cn), (float)value); + } + else // CV_64F + { + k.args(ocl::KernelArg::ReadOnlyNoSize(a), ocl::KernelArg::WriteOnly(a, cn), value); + } + + size_t globalsize[2] = { (size_t)a.cols * cn, ((size_t)a.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, NULL, false); +} + +#endif + +static PatchNanFunc getPatchNanFunc(bool isDouble) +{ + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getPatchNanFunc, (isDouble), CV_CPU_DISPATCH_MODES_ALL); +} + +void patchNaNs( InputOutputArray _a, double _val ) +{ + CV_INSTRUMENT_REGION(); + CV_Assert( _a.depth() == CV_32F || _a.depth() == CV_64F); + + CV_OCL_RUN(_a.isUMat() && _a.dims() <= 2, + ocl_patchNaNs(_a, _val)) + + Mat a = _a.getMat(); + const Mat* arrays[] = {&a, 0}; + uchar* ptrs[1] = {}; + NAryMatIterator it(arrays, ptrs); + size_t len = it.size*a.channels(); + + PatchNanFunc func = getPatchNanFunc(_a.depth() == CV_64F); + + for (size_t i = 0; i < it.nplanes; i++, ++it) + { + func(ptrs[0], len, _val); + } +} + + +#ifdef HAVE_OPENCL + +static bool ocl_finiteMask(const UMat img, UMat mask) +{ + int channels = img.channels(); + int depth = img.depth(); + + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; + if (!doubleSupport && depth == CV_64F) + { + return false; + } + + int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; + ocl::Kernel k("finiteMask", ocl::core::finitemask_oclsrc, + format("-D srcT=%s -D cn=%d -D rowsPerWI=%d %s", + depth == CV_32F ? "float" : "double", channels, rowsPerWI, + doubleSupport ? "-D DOUBLE_SUPPORT" : "")); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnlyNoSize(img), ocl::KernelArg::WriteOnly(mask)); + + size_t globalsize[2] = { (size_t)img.cols, ((size_t)img.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, NULL, false); +} + +#endif + +static FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn) +{ + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getFiniteMaskFunc, (isDouble, cn), CV_CPU_DISPATCH_MODES_ALL); +} + +void finiteMask(InputArray _src, OutputArray _mask) +{ + CV_INSTRUMENT_REGION(); + + int channels = _src.channels(); + int depth = _src.depth(); + CV_Assert( channels > 0 && channels <= 4); + CV_Assert( depth == CV_32F || depth == CV_64F ); + std::vector vsz(_src.dims()); + _src.sizend(vsz.data()); + _mask.create(_src.dims(), vsz.data(), CV_8UC1); + + CV_OCL_RUN(_src.isUMat() && _mask.isUMat() && _src.dims() <= 2, + ocl_finiteMask(_src.getUMat(), _mask.getUMat())); + + Mat src = _src.getMat(); + Mat mask = _mask.getMat(); + + const Mat *arrays[]={&src, &mask, 0}; + Mat planes[2]; + NAryMatIterator it(arrays, planes); + size_t total = planes[0].total(); + size_t i, nplanes = it.nplanes; + + FiniteMaskFunc func = getFiniteMaskFunc((depth == CV_64F), channels); + + for( i = 0; i < nplanes; i++, ++it ) + { + const uchar* sptr = planes[0].ptr(); + uchar* dptr = planes[1].ptr(); + + func(sptr, dptr, total); + } +} +} //namespace cv \ No newline at end of file diff --git a/modules/core/src/nan_mask.simd.hpp b/modules/core/src/nan_mask.simd.hpp new file mode 100644 index 0000000000..511862e26b --- /dev/null +++ b/modules/core/src/nan_mask.simd.hpp @@ -0,0 +1,440 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + + +#include "precomp.hpp" + +namespace cv { + +typedef void (*PatchNanFunc)(uchar* tptr, size_t len, double newVal); +typedef void (*FiniteMaskFunc)(const uchar *src, uchar *dst, size_t total); + +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +PatchNanFunc getPatchNanFunc(bool isDouble); +FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +static void patchNaNs_32f(uchar* ptr, size_t ulen, double newVal) +{ + CV_INSTRUMENT_REGION(); + + int32_t* tptr = (int32_t*)ptr; + Cv32suf val; + val.f = (float)newVal; + + int j = 0; + int len = (int)ulen; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + v_int32 v_pos_mask = vx_setall_s32(0x7fffffff), v_exp_mask = vx_setall_s32(0x7f800000); + v_int32 v_val = vx_setall_s32(val.i); + + int cWidth = VTraits::vlanes(); + for (; j < len - cWidth*2 + 1; j += cWidth*2) + { + v_int32 v_src0 = vx_load(tptr + j); + v_int32 v_src1 = vx_load(tptr + j + cWidth); + + v_int32 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask)); + v_int32 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask)); + + if (v_check_any(v_or(v_cmp_mask0, v_cmp_mask1))) + { + v_int32 v_dst0 = v_select(v_cmp_mask0, v_val, v_src0); + v_int32 v_dst1 = v_select(v_cmp_mask1, v_val, v_src1); + + v_store(tptr + j, v_dst0); + v_store(tptr + j + cWidth, v_dst1); + } + } +#endif + + for (; j < len; j++) + { + if ((tptr[j] & 0x7fffffff) > 0x7f800000) + { + tptr[j] = val.i; + } + } +} + + +static void patchNaNs_64f(uchar* ptr, size_t ulen, double newVal) +{ + CV_INSTRUMENT_REGION(); + + int64_t* tptr = (int64_t*)ptr; + Cv64suf val; + val.f = newVal; + + int j = 0; + int len = (int)ulen; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + v_int64 v_exp_mask = vx_setall_s64(0x7FF0000000000000); + v_int64 v_pos_mask = vx_setall_s64(0x7FFFFFFFFFFFFFFF); + v_int64 v_val = vx_setall_s64(val.i); + + int cWidth = VTraits::vlanes(); + for (; j < len - cWidth * 2 + 1; j += cWidth*2) + { + v_int64 v_src0 = vx_load(tptr + j); + v_int64 v_src1 = vx_load(tptr + j + cWidth); + + v_int64 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask)); + v_int64 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask)); + + if (v_check_any(v_cmp_mask0) || v_check_any(v_cmp_mask1)) + { + // v_select is not available for v_int64, emulating it + v_int32 val32 = v_reinterpret_as_s32(v_val); + v_int64 v_dst0 = v_reinterpret_as_s64(v_select(v_reinterpret_as_s32(v_cmp_mask0), val32, v_reinterpret_as_s32(v_src0))); + v_int64 v_dst1 = v_reinterpret_as_s64(v_select(v_reinterpret_as_s32(v_cmp_mask1), val32, v_reinterpret_as_s32(v_src1))); + + v_store(tptr + j, v_dst0); + v_store(tptr + j + cWidth, v_dst1); + } + } +#endif + + for (; j < len; j++) + if ((tptr[j] & 0x7FFFFFFFFFFFFFFF) > 0x7FF0000000000000) + tptr[j] = val.i; +} + +PatchNanFunc getPatchNanFunc(bool isDouble) +{ + return isDouble ? (PatchNanFunc)GET_OPTIMIZED(patchNaNs_64f) + : (PatchNanFunc)GET_OPTIMIZED(patchNaNs_32f); +} + +////// finiteMask ////// + +#if (CV_SIMD || CV_SIMD_SCALABLE) + +template +int finiteMaskSIMD_(const _Tp *src, uchar *dst, size_t total); + +template <> +int finiteMaskSIMD_(const float *fsrc, uchar *dst, size_t utotal) +{ + const uint32_t* src = (const uint32_t*)fsrc; + const int osize = VTraits::vlanes(); + v_uint32 vmaskExp = vx_setall_u32(0x7f800000); + + int i = 0; + int total = (int)utotal; + for(; i < total - osize + 1; i += osize ) + { + v_uint32 vv0, vv1, vv2, vv3; + vv0 = v_ne(v_and(vx_load(src + i ), vmaskExp), vmaskExp); + vv1 = v_ne(v_and(vx_load(src + i + (osize/4)), vmaskExp), vmaskExp); + vv2 = v_ne(v_and(vx_load(src + i + 2*(osize/4)), vmaskExp), vmaskExp); + vv3 = v_ne(v_and(vx_load(src + i + 3*(osize/4)), vmaskExp), vmaskExp); + + v_store(dst + i, v_pack_b(vv0, vv1, vv2, vv3)); + } + + return i; +} + + +template <> +int finiteMaskSIMD_(const float *fsrc, uchar *dst, size_t utotal) +{ + const uint32_t* src = (const uint32_t*)fsrc; + const int size8 = VTraits::vlanes(); + v_uint32 vmaskExp = vx_setall_u32(0x7f800000); + v_uint16 vmaskBoth = vx_setall_u16(0xffff); + + int i = 0; + int total = (int)utotal; + for(; i < total - (size8 / 2) + 1; i += (size8 / 2) ) + { + v_uint32 vv0, vv1, vv2, vv3; + vv0 = v_ne(v_and(vx_load(src + i*2 ), vmaskExp), vmaskExp); + vv1 = v_ne(v_and(vx_load(src + i*2 + (size8 / 4)), vmaskExp), vmaskExp); + vv2 = v_ne(v_and(vx_load(src + i*2 + 2*(size8 / 4)), vmaskExp), vmaskExp); + vv3 = v_ne(v_and(vx_load(src + i*2 + 3*(size8 / 4)), vmaskExp), vmaskExp); + v_uint8 velems = v_pack_b(vv0, vv1, vv2, vv3); + v_uint16 vfinite = v_eq(v_reinterpret_as_u16(velems), vmaskBoth); + + // 2nd argument in vfinite is useless + v_store_low(dst + i, v_pack(vfinite, vfinite)); + } + + return i; +} + + +template <> +int finiteMaskSIMD_(const float *fsrc, uchar *dst, size_t utotal) +{ + const uint32_t* src = (const uint32_t*)fsrc; + const int npixels = VTraits::vlanes(); + v_uint32 vmaskExp = vx_setall_u32(0x7f800000); + v_uint32 z = vx_setzero_u32(); + + int i = 0; + int total = (int)utotal; + for (; i < total - npixels + 1; i += npixels) + { + v_uint32 vv0, vv1, vv2; + vv0 = v_ne(v_and(vx_load(src + i*3 ), vmaskExp), vmaskExp); + vv1 = v_ne(v_and(vx_load(src + i*3 + npixels), vmaskExp), vmaskExp); + vv2 = v_ne(v_and(vx_load(src + i*3 + 2*npixels), vmaskExp), vmaskExp); + + v_uint8 velems = v_pack_b(vv0, vv1, vv2, z); + + // 2nd arg is useless + v_uint8 vsh1 = v_extract<1>(velems, velems); + v_uint8 vsh2 = v_extract<2>(velems, velems); + + v_uint8 vres3 = v_and(v_and(velems, vsh1), vsh2); + for (int j = 0; j < npixels; j++) + { + dst[i + j] = v_get0(vres3); + // 2nd arg is useless + vres3 = v_extract<3>(vres3, vres3); + } + } + + return i; +} + + +template <> +int finiteMaskSIMD_(const float *fsrc, uchar *dst, size_t utotal) +{ + const uint32_t* src = (const uint32_t*)fsrc; + const int npixels = VTraits::vlanes() / 2; + const int nfloats = VTraits::vlanes(); + const v_uint32 vMaskExp = vx_setall_u32(0x7f800000); + v_uint32 vmaskAll4 = vx_setall_u32(0xFFFFFFFF); + + int i = 0; + int total = (int)utotal; + for(; i < total - npixels + 1; i += npixels ) + { + v_uint32 v0 = vx_load(src + i * 4 + 0*nfloats); + v_uint32 v1 = vx_load(src + i * 4 + 1*nfloats); + v_uint32 v2 = vx_load(src + i * 4 + 2*nfloats); + v_uint32 v3 = vx_load(src + i * 4 + 3*nfloats); + v_uint32 v4 = vx_load(src + i * 4 + 4*nfloats); + v_uint32 v5 = vx_load(src + i * 4 + 5*nfloats); + v_uint32 v6 = vx_load(src + i * 4 + 6*nfloats); + v_uint32 v7 = vx_load(src + i * 4 + 7*nfloats); + + v_uint32 vmask0 = v_ne(v_and(v0, vMaskExp), vMaskExp); + v_uint32 vmask1 = v_ne(v_and(v1, vMaskExp), vMaskExp); + v_uint32 vmask2 = v_ne(v_and(v2, vMaskExp), vMaskExp); + v_uint32 vmask3 = v_ne(v_and(v3, vMaskExp), vMaskExp); + v_uint32 vmask4 = v_ne(v_and(v4, vMaskExp), vMaskExp); + v_uint32 vmask5 = v_ne(v_and(v5, vMaskExp), vMaskExp); + v_uint32 vmask6 = v_ne(v_and(v6, vMaskExp), vMaskExp); + v_uint32 vmask7 = v_ne(v_and(v7, vMaskExp), vMaskExp); + + v_uint8 velems0 = v_pack_b(vmask0, vmask1, vmask2, vmask3); + v_uint8 velems1 = v_pack_b(vmask4, vmask5, vmask6, vmask7); + + v_uint32 vresWide0 = v_eq(v_reinterpret_as_u32(velems0), vmaskAll4); + v_uint32 vresWide1 = v_eq(v_reinterpret_as_u32(velems1), vmaskAll4); + + // last 2 args are useless + v_uint8 vres = v_pack_b(vresWide0, vresWide1, vresWide0, vresWide1); + + v_store_low(dst + i, vres); + } + + return i; +} + + +template <> +int finiteMaskSIMD_(const double *dsrc, uchar *dst, size_t utotal) +{ + const uint64_t* src = (const uint64_t*)dsrc; + const int size8 = VTraits::vlanes(); + int i = 0; + int total = (int)utotal; + + v_uint64 vmaskExp = vx_setall_u64(0x7ff0000000000000); + v_uint64 z = vx_setzero_u64(); + + for(; i < total - (size8 / 2) + 1; i += (size8 / 2) ) + { + v_uint64 vv0, vv1, vv2, vv3; + vv0 = v_ne(v_and(vx_load(src + i ), vmaskExp), vmaskExp); + vv1 = v_ne(v_and(vx_load(src + i + (size8 / 8)), vmaskExp), vmaskExp); + vv2 = v_ne(v_and(vx_load(src + i + 2*(size8 / 8)), vmaskExp), vmaskExp); + vv3 = v_ne(v_and(vx_load(src + i + 3*(size8 / 8)), vmaskExp), vmaskExp); + + v_uint8 v = v_pack_b(vv0, vv1, vv2, vv3, z, z, z, z); + + v_store_low(dst + i, v); + } + + return i; +} + +template <> +int finiteMaskSIMD_(const double *dsrc, uchar *dst, size_t utotal) +{ + const uint64_t* src = (const uint64_t*)dsrc; + const int npixels = VTraits::vlanes() / 2; + const int ndoubles = VTraits::vlanes(); + v_uint64 vmaskExp = vx_setall_u64(0x7ff0000000000000); + v_uint16 vmaskBoth = vx_setall_u16(0xffff); + + int i = 0; + int total = (int)utotal; + for(; i < total - npixels + 1; i += npixels ) + { + v_uint64 vv0 = v_ne(v_and(vx_load(src + i*2 + 0*ndoubles), vmaskExp), vmaskExp); + v_uint64 vv1 = v_ne(v_and(vx_load(src + i*2 + 1*ndoubles), vmaskExp), vmaskExp); + v_uint64 vv2 = v_ne(v_and(vx_load(src + i*2 + 2*ndoubles), vmaskExp), vmaskExp); + v_uint64 vv3 = v_ne(v_and(vx_load(src + i*2 + 3*ndoubles), vmaskExp), vmaskExp); + v_uint64 vv4 = v_ne(v_and(vx_load(src + i*2 + 4*ndoubles), vmaskExp), vmaskExp); + v_uint64 vv5 = v_ne(v_and(vx_load(src + i*2 + 5*ndoubles), vmaskExp), vmaskExp); + v_uint64 vv6 = v_ne(v_and(vx_load(src + i*2 + 6*ndoubles), vmaskExp), vmaskExp); + v_uint64 vv7 = v_ne(v_and(vx_load(src + i*2 + 7*ndoubles), vmaskExp), vmaskExp); + + v_uint8 velems0 = v_pack_b(vv0, vv1, vv2, vv3, vv4, vv5, vv6, vv7); + + v_uint16 vfinite0 = v_eq(v_reinterpret_as_u16(velems0), vmaskBoth); + + // 2nd arg is useless + v_uint8 vres = v_pack(vfinite0, vfinite0); + v_store_low(dst + i, vres); + } + + return i; +} + + +template <> +int finiteMaskSIMD_(const double *dsrc, uchar *dst, size_t utotal) +{ + //TODO: vectorize it properly + + const uint64_t* src = (const uint64_t*)dsrc; + const int npixels = VTraits::vlanes() / 2; + uint64_t maskExp = 0x7ff0000000000000; + + int i = 0; + int total = (int)utotal; + for(; i < total - npixels + 1; i += npixels ) + { + for (int j = 0; j < npixels; j++) + { + uint64_t val0 = src[i * 3 + j * 3 + 0]; + uint64_t val1 = src[i * 3 + j * 3 + 1]; + uint64_t val2 = src[i * 3 + j * 3 + 2]; + + bool finite = ((val0 & maskExp) != maskExp) && + ((val1 & maskExp) != maskExp) && + ((val2 & maskExp) != maskExp); + + dst[i + j] = finite ? 255 : 0; + } + } + + return i; +} + +template <> +int finiteMaskSIMD_(const double *dsrc, uchar *dst, size_t utotal) +{ + //TODO: vectorize it properly + + uint64_t* src = (uint64_t*)dsrc; + const int npixels = VTraits::vlanes() / 2; + const int ndoubles = VTraits::vlanes(); + v_uint16 vmaskExp16 = vx_setall_u16(0x7ff0); + v_uint32 z = vx_setzero_u32(); + + int i = 0; + int total = (int)utotal; + for(; i < total - npixels + 1; i += npixels ) + { + v_uint16 vexpb0, vexpb1, vexpb2, vexpb3, vexpb4, vexpb5, vexpb6, vexpb7; + v_uint16 dummy; + v_load_deinterleave((uint16_t*)(src + 0*4*ndoubles), dummy, dummy, dummy, vexpb0); + v_load_deinterleave((uint16_t*)(src + 1*4*ndoubles), dummy, dummy, dummy, vexpb1); + v_load_deinterleave((uint16_t*)(src + 2*4*ndoubles), dummy, dummy, dummy, vexpb2); + v_load_deinterleave((uint16_t*)(src + 3*4*ndoubles), dummy, dummy, dummy, vexpb3); + + v_uint16 vcmp0 = v_eq(v_and(vexpb0, vmaskExp16), vmaskExp16); + v_uint16 vcmp1 = v_eq(v_and(vexpb1, vmaskExp16), vmaskExp16); + v_uint16 vcmp2 = v_eq(v_and(vexpb2, vmaskExp16), vmaskExp16); + v_uint16 vcmp3 = v_eq(v_and(vexpb3, vmaskExp16), vmaskExp16); + + v_uint8 velems0 = v_pack(vcmp0, vcmp1); + v_uint8 velems1 = v_pack(vcmp2, vcmp3); + + v_uint32 vResWide0 = v_eq(v_reinterpret_as_u32(velems0), z); + v_uint32 vResWide1 = v_eq(v_reinterpret_as_u32(velems1), z); + + v_uint16 vp16 = v_pack(vResWide0, vResWide1); + + // 2nd arg is useless + v_uint8 vres = v_pack(vp16, vp16); + v_store_low(dst, vres); + + src += npixels * 4; + dst += npixels; + } + + return i; +} + +#endif + + +template +void finiteMask_(const uchar *src, uchar *dst, size_t total) +{ + CV_INSTRUMENT_REGION(); + size_t i = 0; + const _Tp* tsrc = (const _Tp*) src; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + i = finiteMaskSIMD_<_Tp, cn>(tsrc, dst, total); +#endif + + for(; i < total; i++ ) + { + bool finite = true; + for (int c = 0; c < cn; c++) + { + _Tp val = tsrc[i * cn + c]; + finite = finite && !cvIsNaN(val) && !cvIsInf(val); + } + dst[i] = finite ? 255 : 0; + } +} + +FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn) +{ + static FiniteMaskFunc tab[] = + { + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_)), + }; + + int idx = (isDouble ? 4 : 0) + cn - 1; + return tab[idx]; +} + +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace cv diff --git a/modules/core/src/opencl/finitemask.cl b/modules/core/src/opencl/finitemask.cl new file mode 100644 index 0000000000..85256e63a7 --- /dev/null +++ b/modules/core/src/opencl/finitemask.cl @@ -0,0 +1,44 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +// This kernel is compiled with the following possible defines: +// - srcT, cn: source type and number of channels per pixel +// - rowsPerWI: Intel GPU optimization +// - DOUBLE_SUPPORT: enable double support if available + +#ifdef DOUBLE_SUPPORT +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + +__kernel void finiteMask(__global const uchar * srcptr, int srcstep, int srcoffset, + __global uchar * dstptr, int dststep, int dstoffset, + int rows, int cols ) +{ + int x = get_global_id(0); + int y0 = get_global_id(1) * rowsPerWI; + + if (x < cols) + { + int src_index = mad24(y0, srcstep, mad24(x, (int)sizeof(srcT) * cn, srcoffset)); + int dst_index = mad24(y0, dststep, x + dstoffset); + + for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += srcstep, dst_index += dststep) + { + bool vfinite = true; + + for (int c = 0; c < cn; c++) + { + srcT val = *(__global srcT *)(srcptr + src_index + c * (int)sizeof(srcT)); + + vfinite = vfinite && !isnan(val) & !isinf(val); + } + + *(dstptr + dst_index) = vfinite ? 255 : 0; + } + } +} \ No newline at end of file diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp index da7a003f11..57cd080855 100644 --- a/modules/core/test/ocl/test_arithm.cpp +++ b/modules/core/test/ocl/test_arithm.cpp @@ -1699,8 +1699,36 @@ OCL_TEST_P(ScaleAdd, Mat) //////////////////////////////// PatchNans //////////////////////////////////////////////// -PARAM_TEST_CASE(PatchNaNs, Channels, bool) +template +_Tp randomNan(RNG& rng); + +template<> +float randomNan(RNG& rng) +{ + uint32_t r = rng.next(); + Cv32suf v; + v.u = r; + // exp & set a bit to avoid zero mantissa + v.u = v.u | 0x7f800001; + return v.f; +} + +template<> +double randomNan(RNG& rng) +{ + uint32_t r0 = rng.next(); + uint32_t r1 = rng.next(); + Cv64suf v; + v.u = (uint64_t(r0) << 32) | uint64_t(r1); + // exp &set a bit to avoid zero mantissa + v.u = v.u | 0x7ff0000000000001; + return v.f; +} + + +PARAM_TEST_CASE(PatchNaNs, MatDepth, Channels, bool) { + int ftype; int cn; bool use_roi; double value; @@ -1709,13 +1737,14 @@ PARAM_TEST_CASE(PatchNaNs, Channels, bool) virtual void SetUp() { - cn = GET_PARAM(0); - use_roi = GET_PARAM(1); + ftype = GET_PARAM(0); + cn = GET_PARAM(1); + use_roi = GET_PARAM(2); } void generateTestData() { - const int type = CV_MAKE_TYPE(CV_32F, cn); + const int type = CV_MAKE_TYPE(ftype, cn); Size roiSize = randomSize(1, 10); Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); @@ -1725,9 +1754,19 @@ PARAM_TEST_CASE(PatchNaNs, Channels, bool) roiSize.width *= cn; for (int y = 0; y < roiSize.height; ++y) { - float * const ptr = src_roi.ptr(y); + float *const ptrf = src_roi.ptr(y); + double *const ptrd = src_roi.ptr(y); for (int x = 0; x < roiSize.width; ++x) - ptr[x] = randomInt(-1, 1) == 0 ? std::numeric_limits::quiet_NaN() : ptr[x]; + { + if (ftype == CV_32F) + { + ptrf[x] = randomInt(-1, 1) == 0 ? randomNan(rng) : ptrf[x]; + } + else if (ftype == CV_64F) + { + ptrd[x] = randomInt(-1, 1) == 0 ? randomNan(rng) : ptrd[x]; + } + } } value = randomDouble(-100, 100); @@ -1754,6 +1793,84 @@ OCL_TEST_P(PatchNaNs, Mat) } } +//////////////////////////////// finiteMask ///////////////////////////////////////////// + + +PARAM_TEST_CASE(FiniteMask, MatDepth, Channels, bool) +{ + int ftype; + int cn; + bool use_roi; + + TEST_DECLARE_INPUT_PARAMETER(src); + TEST_DECLARE_OUTPUT_PARAMETER(mask); + + virtual void SetUp() + { + ftype = GET_PARAM(0); + cn = GET_PARAM(1); + use_roi = GET_PARAM(2); + } + + void generateTestData() + { + const int type = CV_MAKE_TYPE(ftype, cn); + + Size roiSize = randomSize(1, MAX_VALUE); + Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(src, src_roi, roiSize, srcBorder, type, -40, 40); + + randomSubMat(mask, mask_roi, roiSize, srcBorder, CV_8UC1, 5, 16); + + // generating NaNs + const softfloat fpinf = softfloat ::inf(); + const softfloat fninf = softfloat ::inf().setSign(true); + const softdouble dpinf = softdouble::inf(); + const softdouble dninf = softdouble::inf().setSign(true); + for (int y = 0; y < roiSize.height; ++y) + { + float *const ptrf = src_roi.ptr(y); + double *const ptrd = src_roi.ptr(y); + for (int x = 0; x < roiSize.width * cn; ++x) + { + int rem = randomInt(0, 10); + if (ftype == CV_32F) + { + ptrf[x] = rem < 4 ? randomNan(rng) : + rem == 5 ? (float )((x + y)%2 ? fpinf : fninf) : ptrf[x]; + } + else if (ftype == CV_64F) + { + ptrd[x] = rem < 4 ? randomNan(rng) : + rem == 5 ? (double)((x + y)%2 ? dpinf : dninf) : ptrd[x]; + } + } + } + + UMAT_UPLOAD_INPUT_PARAMETER(src); + UMAT_UPLOAD_OUTPUT_PARAMETER(mask); + } + + void Near() + { + OCL_EXPECT_MATS_NEAR(mask, 0); + } +}; + +OCL_TEST_P(FiniteMask, Mat) +{ + for (int j = 0; j < test_loop_times; j++) + { + generateTestData(); + + OCL_OFF(cv::finiteMask(src_roi, mask_roi)); + OCL_ON(cv::finiteMask(usrc_roi, umask_roi)); + + Near(); + } +} + + //////////////////////////////// Psnr //////////////////////////////////////////////// typedef ArithmTestBase Psnr; @@ -1928,7 +2045,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Arithm, InRange, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHA OCL_INSTANTIATE_TEST_CASE_P(Arithm, ConvertScaleAbs, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool())); OCL_INSTANTIATE_TEST_CASE_P(Arithm, ConvertFp16, Combine(OCL_ALL_CHANNELS, Bool())); OCL_INSTANTIATE_TEST_CASE_P(Arithm, ScaleAdd, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Arithm, PatchNaNs, Combine(OCL_ALL_CHANNELS, Bool())); +OCL_INSTANTIATE_TEST_CASE_P(Arithm, PatchNaNs, Combine(::testing::Values(CV_32F, CV_64F), OCL_ALL_CHANNELS, Bool())); +OCL_INSTANTIATE_TEST_CASE_P(Arithm, FiniteMask, Combine(::testing::Values(CV_32F, CV_64F), OCL_ALL_CHANNELS, Bool())); OCL_INSTANTIATE_TEST_CASE_P(Arithm, Psnr, Combine(::testing::Values((MatDepth)CV_8U), OCL_ALL_CHANNELS, Bool())); OCL_INSTANTIATE_TEST_CASE_P(Arithm, UMatDot, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool())); diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index a9d71967aa..ffc748b600 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -729,6 +729,88 @@ struct InRangeOp : public BaseArithmOp } }; +namespace reference { + +template +struct SoftType; + +template<> +struct SoftType +{ + typedef softfloat type; +}; + +template<> +struct SoftType +{ + typedef softdouble type; +}; + + +template +static void finiteMask_(const _Tp *src, uchar *dst, size_t total, int cn) +{ + for(size_t i = 0; i < total; i++ ) + { + bool good = true; + for (int c = 0; c < cn; c++) + { + _Tp val = src[i * cn + c]; + typename SoftType<_Tp>::type sval(val); + + good = good && !sval.isNaN() && !sval.isInf(); + } + dst[i] = good ? 255 : 0; + } +} + +static void finiteMask(const Mat& src, Mat& dst) +{ + dst.create(src.dims, &src.size[0], CV_8UC1); + + const Mat *arrays[]={&src, &dst, 0}; + Mat planes[2]; + NAryMatIterator it(arrays, planes); + size_t total = planes[0].total(); + size_t i, nplanes = it.nplanes; + int depth = src.depth(), cn = src.channels(); + + for( i = 0; i < nplanes; i++, ++it ) + { + const uchar* sptr = planes[0].ptr(); + uchar* dptr = planes[1].ptr(); + + switch( depth ) + { + case CV_32F: finiteMask_((const float*)sptr, dptr, total, cn); break; + case CV_64F: finiteMask_((const double*)sptr, dptr, total, cn); break; + } + } +} +} + + +struct FiniteMaskOp : public BaseElemWiseOp +{ + FiniteMaskOp() : BaseElemWiseOp(1, 0, 1, 1, Scalar::all(0)) {} + void op(const vector& src, Mat& dst, const Mat&) + { + cv::finiteMask(src[0], dst); + } + void refop(const vector& src, Mat& dst, const Mat&) + { + reference::finiteMask(src[0], dst); + } + int getRandomType(RNG& rng) + { + return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_FLT, 1, 4); + } + double getMaxErr(int) + { + return 0; + } +}; + struct ConvertScaleOp : public BaseElemWiseOp { @@ -1573,6 +1655,8 @@ INSTANTIATE_TEST_CASE_P(Core_CmpS, ElemWiseTest, ::testing::Values(ElemWiseOpPtr INSTANTIATE_TEST_CASE_P(Core_InRangeS, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new InRangeSOp))); INSTANTIATE_TEST_CASE_P(Core_InRange, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new InRangeOp))); +INSTANTIATE_TEST_CASE_P(Core_FiniteMask, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new FiniteMaskOp))); + INSTANTIATE_TEST_CASE_P(Core_Flip, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new FlipOp))); INSTANTIATE_TEST_CASE_P(Core_Transpose, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new TransposeOp))); INSTANTIATE_TEST_CASE_P(Core_SetIdentity, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new SetIdentityOp))); @@ -2876,4 +2960,76 @@ TEST(Core_CartPolar, inplace) EXPECT_THROW(cv::cartToPolar(uA[0], uA[1], uA[0], uA[1]), cv::Exception); } + +// Check different values for finiteMask() + +template +_Tp randomNan(RNG& rng); + +template<> +float randomNan(RNG& rng) +{ + uint32_t r = rng.next(); + Cv32suf v; + v.u = r; + // exp & set a bit to avoid zero mantissa + v.u = v.u | 0x7f800001; + return v.f; +} + +template<> +double randomNan(RNG& rng) +{ + uint32_t r0 = rng.next(); + uint32_t r1 = rng.next(); + Cv64suf v; + v.u = (uint64_t(r0) << 32) | uint64_t(r1); + // exp &set a bit to avoid zero mantissa + v.u = v.u | 0x7ff0000000000001; + return v.f; +} + +template +Mat generateFiniteMaskData(int cn, RNG& rng) +{ + typedef typename reference::SoftType::type SFT; + + SFT pinf = SFT::inf(); + SFT ninf = SFT::inf().setSign(true); + + const int len = 100; + Mat_ plainData(1, cn*len); + for(int i = 0; i < cn*len; i++) + { + int r = rng.uniform(0, 3); + plainData(i) = r == 0 ? T(rng.uniform(0, 2) ? pinf : ninf) : + r == 1 ? randomNan(rng) : T(0); + } + + return Mat(plainData).reshape(cn); +} + +typedef std::tuple FiniteMaskFixtureParams; +class FiniteMaskFixture : public ::testing::TestWithParam {}; + +TEST_P(FiniteMaskFixture, flags) +{ + auto p = GetParam(); + int depth = get<0>(p); + int channels = get<1>(p); + + RNG rng((uint64)ARITHM_RNG_SEED); + Mat data = (depth == CV_32F) ? generateFiniteMaskData(channels, rng) + /* CV_64F */ : generateFiniteMaskData(channels, rng); + + Mat nans, gtNans; + cv::finiteMask(data, nans); + reference::finiteMask(data, gtNans); + + EXPECT_MAT_NEAR(nans, gtNans, 0); +} + +// Params are: depth, channels 1 to 4 +INSTANTIATE_TEST_CASE_P(Core_FiniteMask, FiniteMaskFixture, ::testing::Combine(::testing::Values(CV_32F, CV_64F), ::testing::Range(1, 5))); + }} // namespace diff --git a/modules/core/test/test_precomp.hpp b/modules/core/test/test_precomp.hpp index 81ddf45de9..8d9a931db4 100644 --- a/modules/core/test/test_precomp.hpp +++ b/modules/core/test/test_precomp.hpp @@ -8,5 +8,6 @@ #include "opencv2/ts/ocl_test.hpp" #include "opencv2/core/private.hpp" #include "opencv2/core/hal/hal.hpp" +#include "opencv2/core/softfloat.hpp" #endif