From f5b6bea2d4cabdae1901f910ee2eed02a080a203 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <terfendail@mediana.jetos.com>
Date: Mon, 12 Nov 2018 20:42:58 +0300
Subject: [PATCH] Raised bilateralFilter processing precision for CV_32F
 matrices containing NaNs

---
 .../include/opencv2/core/hal/intrin_avx.hpp   |  5 +
 .../include/opencv2/core/hal/intrin_cpp.hpp   | 19 ++++
 .../include/opencv2/core/hal/intrin_neon.hpp  |  7 ++
 .../include/opencv2/core/hal/intrin_sse.hpp   |  5 +
 .../include/opencv2/core/hal/intrin_vsx.hpp   |  5 +
 modules/imgproc/src/bilateral_filter.cpp      | 97 ++++++++++++-------
 6 files changed, 104 insertions(+), 34 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index f8cc7a4d00..30377048ae 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -905,6 +905,11 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
 
+inline v_float32x8 v_not_nan(const v_float32x8& a)
+{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
+inline v_float64x4 v_not_nan(const v_float64x4& a)
+{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
+
 /** min/max **/
 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32,  _mm256_min_epu8)
 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32,  _mm256_max_epu8)
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 5712f167a8..1cfb14ae06 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -683,6 +683,25 @@ OPENCV_HAL_IMPL_CMP_OP(==)
 For all types except 64-bit integer values. */
 OPENCV_HAL_IMPL_CMP_OP(!=)
 
+template<int n>
+inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
+{
+typedef typename V_TypeTraits<float>::int_type itype;
+v_reg<float, n> c;
+for (int i = 0; i < n; i++)
+    c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
+    return c;
+}
+template<int n>
+inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
+{
+    typedef typename V_TypeTraits<double>::int_type itype;
+    v_reg<double, n> c;
+    for (int i = 0; i < n; i++)
+        c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
+    return c;
+}
+
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 50c9b154ee..2de4e45283 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -764,6 +764,13 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
 #endif
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
+#if CV_SIMD128_64F
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
+#endif
+
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index c49d0de377..283c5158d7 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1041,6 +1041,11 @@ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
+
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index b23e19950e..fe4a5db5df 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -607,6 +607,11 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vec_cmpeq(a.val, a.val)); }
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vec_cmpeq(a.val, a.val)); }
+
 /** min/max **/
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.cpp
index 5e39fa4de5..8678cbfa3d 100644
--- a/modules/imgproc/src/bilateral_filter.cpp
+++ b/modules/imgproc/src/bilateral_filter.cpp
@@ -430,36 +430,44 @@ public:
                     for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
                     {
                         v_float32 val = vx_load(ksptr + j);
-
-                        v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex;
+                        v_float32 rval = vx_load(sptr + j);
+                        v_float32 knan = v_not_nan(val);
+                        v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
                         v_int32 idx = v_trunc(alpha);
                         alpha -= v_cvt_f32(idx);
 
-                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha));
+                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
                         v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j)));
+                        v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j)));
                     }
 #endif
                     for (; j < size.width; j++)
                     {
                         float val = ksptr[j];
-                        float alpha = std::abs(val - sptr[j]) * scale_index;
+                        float rval = sptr[j];
+                        float alpha = std::abs(val - rval) * scale_index;
                         int idx = cvFloor(alpha);
                         alpha -= idx;
-                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
-                        wsum[j] += w;
-                        sum[j] += val * w;
+                        if (!cvIsNaN(val))
+                        {
+                            float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum[j] += val * w;
+                        }
                     }
                 }
                 j = 0;
 #if CV_SIMD
                 for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j));
+                {
+                    v_float32 v_val = vx_load(sptr + j);
+                    v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val))));
+                }
 #endif
                 for (; j < size.width; j++)
                 {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    dptr[j] = sum[j] / wsum[j];
+                    CV_DbgAssert(fabs(wsum[j]) >= 0);
+                    dptr[j] = cvIsNaN(sptr[j]) ? sum[j] / wsum[j] : (sum[j] + sptr[j]) / (wsum[j] + 1.f);
                 }
             }
             else
@@ -488,45 +496,68 @@ public:
                         v_load_deinterleave(ksptr, kb, kg, kr);
                         v_load_deinterleave(rsptr, rb, rg, rr);
 
-                        v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex;
+                        v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                        v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
                         v_int32 idx = v_trunc(alpha);
                         alpha -= v_cvt_f32(idx);
 
-                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha));
+                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
                         v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
                     }
 #endif
                     for (; j < size.width; j++, ksptr += 3, rsptr += 3)
                     {
                         float b = ksptr[0], g = ksptr[1], r = ksptr[2];
-                        float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index;
+                        bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                        float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+                        bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
+                        float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
                         int idx = cvFloor(alpha);
                         alpha -= idx;
-                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]));
-                        wsum[j] += w;
-                        sum_b[j] += b*w;
-                        sum_g[j] += g*w;
-                        sum_r[j] += r*w;
+                        if (!v_NAN)
+                        {
+                            float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum_b[j] += b*w;
+                            sum_g[j] += g*w;
+                            sum_r[j] += r*w;
+                        }
                     }
                 }
                 j = 0;
 #if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes)
+                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
                 {
-                    v_float32 w = v_one / vx_load_aligned(wsum + j);
-                    v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w);
+                    v_float32 b, g, r;
+                    v_load_deinterleave(sptr, b, g, r);
+                    v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
+                    v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
+                    v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
                 }
 #endif
                 for (; j < size.width; j++)
                 {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    wsum[j] = 1.f / wsum[j];
-                    *(dptr++) = sum_b[j] * wsum[j];
-                    *(dptr++) = sum_g[j] * wsum[j];
-                    *(dptr++) = sum_r[j] * wsum[j];
+                    CV_DbgAssert(fabs(wsum[j]) >= 0);
+                    float b = *(sptr++);
+                    float g = *(sptr++);
+                    float r = *(sptr++);
+                    if (cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r))
+                    {
+                        wsum[j] = 1.f / wsum[j];
+                        *(dptr++) = sum_b[j] * wsum[j];
+                        *(dptr++) = sum_g[j] * wsum[j];
+                        *(dptr++) = sum_r[j] * wsum[j];
+                    }
+                    else
+                    {
+                        wsum[j] = 1.f / (wsum[j] + 1.f);
+                        *(dptr++) = (sum_b[j] + b) * wsum[j];
+                        *(dptr++) = (sum_g[j] + g) * wsum[j];
+                        *(dptr++) = (sum_r[j] + r) * wsum[j];
+                    }
                 }
             }
         }
@@ -585,9 +616,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
     // temporary copy of the image with borders for easy processing
     Mat temp;
     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-    minValSrc -= 5. * sigma_color;
-    patchNaNs( temp, minValSrc ); // this replacement of NaNs makes the assumption that depth values are nonnegative
-                                  // TODO: make replacement parameter avalible in the outside function interface
+
     // allocate lookup tables
     std::vector<float> _space_weight(d*d);
     std::vector<int> _space_ofs(d*d);
@@ -620,7 +649,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
         for( j = -radius; j <= radius; j++ )
         {
             double r = std::sqrt((double)i*i + (double)j*j);
-            if( r > radius )
+            if( r > radius || ( i == 0 && j == 0 ) )
                 continue;
             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
             space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);