Merge pull request #24480 from savuor:backport_patch_nans

Backport to 4.x: patchNaNs() SIMD acceleration #24480

backport from #23098
connected PR in extra: [#1118@extra](https://github.com/opencv/opencv_extra/pull/1118)

### This PR contains:
* new SIMD code for `patchNaNs()`
* CPU perf test

<details>
<summary>Performance comparison</summary>

Geometric mean (ms)

|Name of Test|noopt|sse2|avx2|sse2 vs noopt (x-factor)|avx2 vs noopt (x-factor)|
|---|:-:|:-:|:-:|:-:|:-:|
|PatchNaNs::OCL_PatchNaNsFixture::(640x480, 32FC1)|0.019|0.017|0.018|1.11|1.07|
|PatchNaNs::OCL_PatchNaNsFixture::(640x480, 32FC4)|0.037|0.037|0.033|1.00|1.10|
|PatchNaNs::OCL_PatchNaNsFixture::(1280x720, 32FC1)|0.032|0.032|0.033|0.99|0.98|
|PatchNaNs::OCL_PatchNaNsFixture::(1280x720, 32FC4)|0.072|0.072|0.070|1.00|1.03|
|PatchNaNs::OCL_PatchNaNsFixture::(1920x1080, 32FC1)|0.051|0.051|0.050|1.00|1.01|
|PatchNaNs::OCL_PatchNaNsFixture::(1920x1080, 32FC4)|0.137|0.138|0.128|0.99|1.06|
|PatchNaNs::OCL_PatchNaNsFixture::(3840x2160, 32FC1)|0.137|0.128|0.129|1.07|1.06|
|PatchNaNs::OCL_PatchNaNsFixture::(3840x2160, 32FC4)|0.450|0.450|0.448|1.00|1.01|
|PatchNaNs::PatchNaNsFixture::(640x480, 32FC1)|0.149|0.029|0.020|5.13|7.44|
|PatchNaNs::PatchNaNsFixture::(640x480, 32FC2)|0.304|0.058|0.040|5.25|7.65|
|PatchNaNs::PatchNaNsFixture::(640x480, 32FC3)|0.448|0.086|0.059|5.22|7.55|
|PatchNaNs::PatchNaNsFixture::(640x480, 32FC4)|0.601|0.133|0.083|4.51|7.23|
|PatchNaNs::PatchNaNsFixture::(1280x720, 32FC1)|0.451|0.093|0.060|4.83|7.52|
|PatchNaNs::PatchNaNsFixture::(1280x720, 32FC2)|0.892|0.184|0.126|4.85|7.06|
|PatchNaNs::PatchNaNsFixture::(1280x720, 32FC3)|1.345|0.311|0.230|4.32|5.84|
|PatchNaNs::PatchNaNsFixture::(1280x720, 32FC4)|1.831|0.546|0.436|3.35|4.20|
|PatchNaNs::PatchNaNsFixture::(1920x1080, 32FC1)|1.017|0.250|0.160|4.06|6.35|
|PatchNaNs::PatchNaNsFixture::(1920x1080, 32FC2)|2.077|0.646|0.605|3.21|3.43|
|PatchNaNs::PatchNaNsFixture::(1920x1080, 32FC3)|3.134|1.053|0.961|2.97|3.26|
|PatchNaNs::PatchNaNsFixture::(1920x1080, 32FC4)|4.222|1.436|1.288|2.94|3.28|
|PatchNaNs::PatchNaNsFixture::(3840x2160, 32FC1)|4.225|1.401|1.277|3.01|3.31|
|PatchNaNs::PatchNaNsFixture::(3840x2160, 32FC2)|8.310|2.953|2.635|2.81|3.15|
|PatchNaNs::PatchNaNsFixture::(3840x2160, 32FC3)|12.396|4.455|4.252|2.78|2.92|
|PatchNaNs::PatchNaNsFixture::(3840x2160, 32FC4)|17.174|5.831|5.824|2.95|2.95|

</details>

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is an accuracy test, a performance test, and test data in the opencv_extra repository, if applicable
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
pull/24492/head
Rostislav Vasilikhin 1 year ago committed by GitHub
parent 7c9231ffba
commit ea47cb3ffe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      modules/core/include/opencv2/core.hpp
  2. 66
      modules/core/perf/perf_arithm.cpp
  3. 35
      modules/core/src/mathfuncs.cpp

@ -1697,7 +1697,7 @@ elements.
CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0,
double minVal = -DBL_MAX, double maxVal = DBL_MAX);
/** @brief converts NaNs to the given number
/** @brief Replaces NaNs by given number
@param a input/output matrix (CV_32F type).
@param val value to convert the NaNs
*/

@ -1,5 +1,6 @@
#include "perf_precomp.hpp"
#include <numeric>
#include "opencv2/core/softfloat.hpp"
namespace opencv_test
{
@ -451,4 +452,69 @@ INSTANTIATE_TEST_CASE_P(/*nothing*/ , BinaryOpTest,
)
);
///////////// PatchNaNs ////////////////////////
// Returns a NaN value of type _Tp built from random bits drawn from rng.
// Declaration only: the definitions are the explicit specializations for
// float and double that follow.
template<typename _Tp>
_Tp randomNan(RNG& rng);
// float specialization: draws 32 random bits from rng and turns them into a NaN.
// The sign and most of the mantissa stay random; only the bits in the mask are forced.
template<>
float randomNan(RNG& rng)
{
    // All eight exponent bits set, plus the lowest mantissa bit so the
    // mantissa can never end up being zero.
    const uint32_t forcedBits = 0x7f800001;

    Cv32suf bits;
    bits.u = rng.next() | forcedBits;
    return bits.f;
}
// double specialization: combines two 32-bit draws from rng into 64 random bits
// and turns them into a NaN. The first draw supplies the upper half, the second
// the lower half.
template<>
double randomNan(RNG& rng)
{
    const uint64_t upperHalf = rng.next();
    const uint64_t lowerHalf = rng.next();

    Cv64suf bits;
    // All eleven exponent bits set, plus the lowest mantissa bit so the
    // mantissa can never end up being zero.
    bits.u = (upperHalf << 32) | lowerHalf | 0x7ff0000000000001;
    return bits.f;
}
typedef Size_MatType PatchNaNsFixture;

// Performance test for cv::patchNaNs() applied in place to a float matrix in
// which NaNs have been planted in a checkerboard pattern.
PERF_TEST_P_(PatchNaNsFixture, PatchNaNs)
{
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
    const int type = get<1>(params);
    // Number of float elements in one row, counting every channel separately.
    const int rowLen = srcSize.width * CV_MAT_CN(type);

    Mat src(srcSize, type);
    declare.in(src, WARMUP_RNG).out(src);

    // Plant the NaNs: an element is overwritten when (row + column) is even,
    // every other element keeps the value it had after declare.in().
    RNG& rng = theRNG();
    for (int row = 0; row < srcSize.height; ++row)
    {
        float* const line = src.ptr<float>(row);
        for (int col = 0; col < rowLen; ++col)
        {
            if ((row + col) % 2 == 0)
                line[col] = randomNan<float>(rng);
        }
    }

    // NOTE(review): src is patched in place, so presumably only the first
    // cycle iteration actually encounters NaNs - confirm this is intended.
    TEST_CYCLE() cv::patchNaNs(src, 17.7);

    SANITY_CHECK(src);
}
// Instantiates the PatchNaNsFixture tests for every combination of the four
// frame sizes and the four 32-bit float types (one to four channels) below.
INSTANTIATE_TEST_CASE_P(/*nothing*/ , PatchNaNsFixture,
testing::Combine(
testing::Values(szVGA, sz720p, sz1080p, sz2160p),
testing::Values(CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4)
)
);
} // namespace

@ -1610,30 +1610,37 @@ void patchNaNs( InputOutputArray _a, double _val )
const Mat* arrays[] = {&a, 0};
int* ptrs[1] = {};
NAryMatIterator it(arrays, (uchar**)ptrs);
size_t len = it.size*a.channels();
int len = (int)(it.size*a.channels());
Cv32suf val;
val.f = (float)_val;
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
v_int32 v_val = vx_setall_s32(val.i);
#endif
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
int* tptr = ptrs[0];
size_t j = 0;
int j = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
size_t cWidth = (size_t)VTraits<v_int32>::vlanes();
for ( ; j + cWidth <= len; j += cWidth)
v_int32 v_pos_mask = vx_setall_s32(0x7fffffff), v_exp_mask = vx_setall_s32(0x7f800000);
v_int32 v_val = vx_setall_s32(val.i);
int cWidth = VTraits<v_int32>::vlanes();
for (; j < len - cWidth * 2 + 1; j += cWidth * 2)
{
v_int32 v_src = vx_load(tptr + j);
v_int32 v_cmp_mask = v_lt(v_mask2, v_and(v_src, v_mask1));
v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
v_store(tptr + j, v_dst);
v_int32 v_src0 = vx_load(tptr + j);
v_int32 v_src1 = vx_load(tptr + j + cWidth);
v_int32 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask));
v_int32 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask));
if (v_check_any(v_or(v_cmp_mask0, v_cmp_mask1)))
{
v_int32 v_dst0 = v_select(v_cmp_mask0, v_val, v_src0);
v_int32 v_dst1 = v_select(v_cmp_mask1, v_val, v_src1);
v_store(tptr + j, v_dst0);
v_store(tptr + j + cWidth, v_dst1);
}
}
vx_cleanup();
#endif
for( ; j < len; j++ )

Loading…
Cancel
Save