From fd832bb57df93e1a9d471f0f4d377819e7bac391 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 14 Oct 2018 00:37:10 +0000 Subject: [PATCH] core: follow IEEE 754 rules for floating-point division --- 3rdparty/carotene/src/div.cpp | 58 ++++++++++----------------- modules/core/include/opencv2/core.hpp | 9 ++++- modules/core/src/arithm_core.hpp | 14 +++++-- modules/core/src/arithm_simd.hpp | 16 -------- modules/core/src/opencl/arithm.cl | 24 +++++++++++ 5 files changed, 63 insertions(+), 58 deletions(-) diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp index cb5f1e7e94..38892acab3 100644 --- a/3rdparty/carotene/src/div.cpp +++ b/3rdparty/carotene/src/div.cpp @@ -151,6 +151,10 @@ void div(const Size2D &size, typedef typename internal::VecTraits::vec128 vec128; typedef typename internal::VecTraits::vec64 vec64; +#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) + static_assert(std::numeric_limits::is_integer, "template implementation is for integer types only"); +#endif + if (scale == 0.0f || (std::numeric_limits::is_integer && (scale * std::numeric_limits::max()) < 1.0f && @@ -311,6 +315,10 @@ void recip(const Size2D &size, typedef typename internal::VecTraits::vec128 vec128; typedef typename internal::VecTraits::vec64 vec64; +#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) + static_assert(std::numeric_limits::is_integer, "template implementation is for integer types only"); +#endif + if (scale == 0.0f || (std::numeric_limits::is_integer && scale < 1.0f && @@ -463,8 +471,6 @@ void div(const Size2D &size, return; } - float32x4_t v_zero = vdupq_n_f32(0.0f); - size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; @@ -485,9 +491,7 @@ void div(const Size2D &size, float32x4_t v_src0 = vld1q_f32(src0 + j); float32x4_t v_src1 = vld1q_f32(src1 + j); - uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); - vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( - vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask))); + vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))); } for (; j < roiw64; j += 2) @@ -495,14 +499,12 @@ void div(const Size2D &size, float32x2_t v_src0 = vld1_f32(src0 + j); float32x2_t v_src1 = vld1_f32(src1 + j); - uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); - vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( - vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask))); + vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1))); } for (; j < size.width; j++) { - dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f; + dst[j] = src0[j] / src1[j]; } } } @@ -523,10 +525,8 @@ void div(const Size2D &size, float32x4_t v_src0 = vld1q_f32(src0 + j); float32x4_t v_src1 = vld1q_f32(src1 + j); - uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); - vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( - vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale), - internal::vrecpq_f32(v_src1))), v_mask))); + vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale), + internal::vrecpq_f32(v_src1))); } for (; j < roiw64; j += 2) @@ -534,15 +534,13 @@ void div(const Size2D &size, float32x2_t v_src0 = vld1_f32(src0 + j); float32x2_t v_src1 = vld1_f32(src1 + j); - uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); - vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( - vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale), - internal::vrecp_f32(v_src1))), v_mask))); + vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale), + internal::vrecp_f32(v_src1))); } for (; j < size.width; j++) { - dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f; + dst[j] = src0[j] * scale / src1[j]; } } } @@ -620,8 +618,6 @@ void reciprocal(const Size2D &size, return; } - float32x4_t v_zero = vdupq_n_f32(0.0f); - size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; @@ -639,23 +635,19 @@ void reciprocal(const Size2D &size, float32x4_t v_src1 = vld1q_f32(src1 + j); - uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); - vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( - vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask))); + vst1q_f32(dst + j, internal::vrecpq_f32(v_src1)); } for (; j < roiw64; j += 2) { float32x2_t v_src1 = vld1_f32(src1 + j); - uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); - vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( - vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask))); + vst1_f32(dst + j, internal::vrecp_f32(v_src1)); } for (; j < size.width; j++) { - dst[j] = src1[j] ? 1.0f / src1[j] : 0; + dst[j] = 1.0f / src1[j]; } } } @@ -673,25 +665,19 @@ void reciprocal(const Size2D &size, float32x4_t v_src1 = vld1q_f32(src1 + j); - uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); - vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( - vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1), - scale)),v_mask))); + vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale)); } for (; j < roiw64; j += 2) { float32x2_t v_src1 = vld1_f32(src1 + j); - uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); - vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( - vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1), - scale)), v_mask))); + vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale)); } for (; j < size.width; j++) { - dst[j] = src1[j] ? scale / src1[j] : 0; + dst[j] = scale / src1[j]; } } } diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index c8216b28f3..73456b1fb2 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -415,8 +415,13 @@ The function cv::divide divides one array by another: or a scalar by an array when there is no src1 : \f[\texttt{dst(I) = saturate(scale/src2(I))}\f] -When src2(I) is zero, dst(I) will also be zero. Different channels of -multi-channel arrays are processed independently. +Different channels of multi-channel arrays are processed independently. + +For integer types when src2(I) is zero, dst(I) will also be zero. + +@note In case of floating point data there is no special defined behavior for zero src2(I) values. +Regular floating-point division is used. +Expect correct IEEE-754 behaviour for floating-point data (with NaN, Inf result values). @note Saturation is not applied when the output array has the depth CV_32S. You may even get result of an incorrect sign in the case of overflow. diff --git a/modules/core/src/arithm_core.hpp b/modules/core/src/arithm_core.hpp index 99b564cf74..7b7d6f7d85 100644 --- a/modules/core/src/arithm_core.hpp +++ b/modules/core/src/arithm_core.hpp @@ -516,7 +516,10 @@ div_i( const T* src1, size_t step1, const T* src2, size_t step2, for( ; i < width; i++ ) { T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + T v = 0; + if (denom != 0) + v = saturate_cast(num*scale_f/denom); + dst[i] = v; } } } @@ -538,7 +541,7 @@ div_f( const T* src1, size_t step1, const T* src2, size_t step2, for( ; i < width; i++ ) { T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + dst[i] = saturate_cast(num*scale_f/denom); } } } @@ -559,7 +562,10 @@ recip_i( const T* src2, size_t step2, for( ; i < width; i++ ) { T denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; + T v = 0; + if (denom != 0) + v = saturate_cast(scale_f/denom); + dst[i] = v; } } } @@ -580,7 +586,7 @@ recip_f( const T* src2, size_t step2, for( ; i < width; i++ ) { T denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; + dst[i] = saturate_cast(scale_f/denom); } } } diff --git a/modules/core/src/arithm_simd.hpp b/modules/core/src/arithm_simd.hpp index 5a37b4c200..98a0126d20 100644 --- a/modules/core/src/arithm_simd.hpp +++ b/modules/core/src/arithm_simd.hpp @@ -1433,7 +1433,6 @@ struct Div_SIMD return x; v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); for ( ; x <= width - 8; x += 8) { @@ -1445,9 +1444,6 @@ struct Div_SIMD v_float32x4 res0 = f0 * v_scale / f2; v_float32x4 res1 = f1 * v_scale / f3; - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - v_store(dst + x, res0); v_store(dst + x + 4, res1); } @@ -1675,7 +1671,6 @@ struct Recip_SIMD return x; v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); for ( ; x <= width - 8; x += 8) { @@ -1685,9 +1680,6 @@ struct Recip_SIMD v_float32x4 res0 = v_scale / f0; v_float32x4 res1 = v_scale / f1; - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - v_store(dst + x, res0); v_store(dst + x + 4, res1); } @@ -1712,7 +1704,6 @@ struct Div_SIMD return x; v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); for ( ; x <= width - 4; x += 4) { @@ -1724,9 +1715,6 @@ struct Div_SIMD v_float64x2 res0 = f0 * v_scale / f2; v_float64x2 res1 = f1 * v_scale / f3; - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - v_store(dst + x, res0); v_store(dst + x + 2, res1); } @@ -1749,7 +1737,6 @@ struct Recip_SIMD return x; v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); for ( ; x <= width - 4; x += 4) { @@ -1759,9 +1746,6 @@ struct Recip_SIMD v_float64x2 res0 = v_scale / f0; v_float64x2 res1 = v_scale / f1; - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - v_store(dst + x, res0); v_store(dst + x + 2, res1); } diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index b037a07d09..d4165faae3 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -227,9 +227,15 @@ #define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2)) #elif defined OP_DIV +#ifdef CV_DST_TYPE_IS_INTEGER #define PROCESS_ELEM \ workT e2 = srcelem2, zero = (workT)(0); \ storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero)) +#else +#define PROCESS_ELEM \ + workT e2 = srcelem2; \ + storedst(convertToDT(srcelem1 / e2)) +#endif #elif defined OP_DIV_SCALE #undef EXTRA_PARAMS @@ -240,9 +246,15 @@ #else #define EXTRA_PARAMS , scaleT scale #endif +#ifdef CV_DST_TYPE_IS_INTEGER #define PROCESS_ELEM \ workT e2 = srcelem2, zero = (workT)(0); \ storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2))) +#else +#define PROCESS_ELEM \ + workT e2 = srcelem2; \ + storedst(convertToDT(srcelem1 * (workT)(scale) / e2)) +#endif #elif defined OP_RDIV_SCALE #undef EXTRA_PARAMS @@ -253,16 +265,28 @@ #else #define EXTRA_PARAMS , scaleT scale #endif +#ifdef CV_DST_TYPE_IS_INTEGER #define PROCESS_ELEM \ workT e1 = srcelem1, zero = (workT)(0); \ storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1))) +#else +#define PROCESS_ELEM \ + workT e1 = srcelem1; \ + storedst(convertToDT(srcelem2 * (workT)(scale) / e1)) +#endif #elif defined OP_RECIP_SCALE #undef EXTRA_PARAMS #define EXTRA_PARAMS , scaleT scale +#ifdef CV_DST_TYPE_IS_INTEGER #define PROCESS_ELEM \ workT e1 = srcelem1, zero = (workT)(0); \ storedst(convertToDT(e1 != zero ? scale / e1 : zero)) +#else +#define PROCESS_ELEM \ + workT e1 = srcelem1; \ + storedst(convertToDT(scale / e1)) +#endif #elif defined OP_ADDW #undef EXTRA_PARAMS