From 28fd967148920ead4b55de4df3354c76bedf3cb3 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <terfendail@mediana.jetos.com>
Date: Fri, 9 Nov 2018 15:27:30 +0300
Subject: [PATCH 1/2] Updated bilateralFilter implementations to use wide
 universal intrinsics

---
 .../include/opencv2/core/hal/intrin_avx.hpp   |  19 +-
 modules/imgproc/src/smooth.cpp                | 492 ++++++++----------
 2 files changed, 230 insertions(+), 281 deletions(-)
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 58b3e7fae7..f8cc7a4d00 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1363,25 +1363,22 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
 
 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
 {
-    int CV_DECL_ALIGNED(32) idx[8];
-    v_store_aligned(idx, idxvec);
-    return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
-                                       tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+    return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
+}
+
+inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
+{
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
 }
 
 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
 {
-    int CV_DECL_ALIGNED(32) idx[8];
-    v_store_aligned(idx, idxvec);
-    return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
-                                      tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+    return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
 }
 
 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
 {
-    int CV_DECL_ALIGNED(32) idx[8];
-    v_store_aligned(idx, idxvec);
-    return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+    return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
 }
 
 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index a4292b6089..e2fe6a58d5 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -2527,10 +2527,6 @@ public:
     {
         int i, j, cn = dest->channels(), k;
         Size size = dest->size();
-#if CV_SIMD128
-        int CV_DECL_ALIGNED(16) buf[4];
-        bool haveSIMD128 = hasSIMD128();
-#endif
 
         for( i = range.start; i < range.end; i++ )
         {
@@ -2539,154 +2535,153 @@ public:
 
             if( cn == 1 )
             {
-                for( j = 0; j < size.width; j++ )
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
+                for( k = 0; k < maxk; k++ )
                 {
-                    float sum = 0, wsum = 0;
-                    int val0 = sptr[j];
-                    k = 0;
-#if CV_SIMD128
-                    if( haveSIMD128 )
+                    const uchar* ksptr = sptr + space_ofs[k];
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
                     {
-                        v_float32x4 _val0 = v_setall_f32(static_cast<float>(val0));
-                        v_float32x4 vsumw = v_setzero_f32();
-                        v_float32x4 vsumc = v_setzero_f32();
-
-                        for( ; k <= maxk - 4; k += 4 )
-                        {
-                            v_float32x4 _valF = v_float32x4(sptr[j + space_ofs[k]],
-                                sptr[j + space_ofs[k + 1]],
-                                sptr[j + space_ofs[k + 2]],
-                                sptr[j + space_ofs[k + 3]]);
-                            v_float32x4 _val = v_abs(_valF - _val0);
-                            v_store(buf, v_round(_val));
-
-                            v_float32x4 _cw = v_float32x4(color_weight[buf[0]],
-                                color_weight[buf[1]],
-                                color_weight[buf[2]],
-                                color_weight[buf[3]]);
-                            v_float32x4 _sw = v_load(space_weight+k);
-#if defined(_MSC_VER) && _MSC_VER == 1700/* MSVS 2012 */ && CV_AVX
-                            // details: https://github.com/opencv/opencv/issues/11004
-                            vsumw += _cw * _sw;
-                            vsumc += _cw * _sw * _valF;
-#else
-                            v_float32x4 _w = _cw * _sw;
-                            _cw = _w * _valF;
-
-                            vsumw += _w;
-                            vsumc += _cw;
-#endif
-                        }
-                        float *bufFloat = (float*)buf;
-                        v_float32x4 sum4 = v_reduce_sum4(vsumw, vsumc, vsumw, vsumc);
-                        v_store(bufFloat, sum4);
-                        sum += bufFloat[1];
-                        wsum += bufFloat[0];
+                        v_uint32 val = vx_load_expand_q(ksptr + j);
+                        v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
+                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                        v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)));
                     }
 #endif
-                    for( ; k < maxk; k++ )
+                    for (; j < size.width; j++)
                     {
-                        int val = sptr[j + space_ofs[k]];
-                        float w = space_weight[k]*color_weight[std::abs(val - val0)];
-                        sum += val*w;
-                        wsum += w;
+                        int val = ksptr[j];
+                        float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
+                        wsum[j] += w;
+                        sum[j] += val * w;
                     }
+                }
+                j = 0;
+#if CV_SIMD
+                for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes)
+                    v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j                    ) / vx_load_aligned(wsum + j                    )),
+                                                    v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes))));
+#endif
+                for (; j < size.width; j++)
+                {
                     // overflow is not possible here => there is no need to use cv::saturate_cast
-                    CV_DbgAssert(fabs(wsum) > 0);
-                    dptr[j] = (uchar)cvRound(sum/wsum);
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    dptr[j] = (uchar)cvRound(sum[j]/wsum[j]);
                 }
             }
             else
             {
                 assert( cn == 3 );
-                for( j = 0; j < size.width*3; j += 3 )
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
+                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
+                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
+                for(k = 0; k < maxk; k++ )
                 {
-                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
-                    int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
-                    k = 0;
-#if CV_SIMD128
-                    if( haveSIMD128 )
+                    const uchar* ksptr = sptr + space_ofs[k];
+                    const uchar* rsptr = sptr;
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes)
                     {
-                        v_float32x4 vsumw = v_setzero_f32();
-                        v_float32x4 vsumb = v_setzero_f32();
-                        v_float32x4 vsumg = v_setzero_f32();
-                        v_float32x4 vsumr = v_setzero_f32();
-                        const v_float32x4 _b0 = v_setall_f32(static_cast<float>(b0));
-                        const v_float32x4 _g0 = v_setall_f32(static_cast<float>(g0));
-                        const v_float32x4 _r0 = v_setall_f32(static_cast<float>(r0));
-
-                        for( ; k <= maxk - 4; k += 4 )
-                        {
-                            const uchar* const sptr_k0  = sptr + j + space_ofs[k];
-                            const uchar* const sptr_k1  = sptr + j + space_ofs[k+1];
-                            const uchar* const sptr_k2  = sptr + j + space_ofs[k+2];
-                            const uchar* const sptr_k3  = sptr + j + space_ofs[k+3];
-
-                            v_float32x4 __b = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k0)));
-                            v_float32x4 __g = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k1)));
-                            v_float32x4 __r = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k2)));
-                            v_float32x4 __z = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k3)));
-                            v_float32x4 _b, _g, _r, _z;
-
-                            v_transpose4x4(__b, __g, __r, __z, _b, _g, _r, _z);
-
-                            v_float32x4 bt = v_abs(_b -_b0);
-                            v_float32x4 gt = v_abs(_g -_g0);
-                            v_float32x4 rt = v_abs(_r -_r0);
-
-                            bt = rt + bt + gt;
-                            v_store(buf, v_round(bt));
-
-                            v_float32x4 _w  = v_float32x4(color_weight[buf[0]],color_weight[buf[1]],
-                                                    color_weight[buf[2]],color_weight[buf[3]]);
-                            v_float32x4 _sw = v_load(space_weight+k);
-
-#if defined(_MSC_VER) && _MSC_VER == 1700/* MSVS 2012 */ && CV_AVX
-                            // details: https://github.com/opencv/opencv/issues/11004
-                            vsumw += _w * _sw;
-                            vsumb += _w * _sw * _b;
-                            vsumg += _w * _sw * _g;
-                            vsumr += _w * _sw * _r;
-#else
-                            _w *= _sw;
-                            _b *=  _w;
-                            _g *=  _w;
-                            _r *=  _w;
-
-                            vsumw += _w;
-                            vsumb += _b;
-                            vsumg += _g;
-                            vsumr += _r;
-#endif
-                        }
-                        float *bufFloat = (float*)buf;
-                        v_float32x4 sum4 = v_reduce_sum4(vsumw, vsumb, vsumg, vsumr);
-                        v_store(bufFloat, sum4);
-                        wsum += bufFloat[0];
-                        sum_b += bufFloat[1];
-                        sum_g += bufFloat[2];
-                        sum_r += bufFloat[3];
+                        v_uint8 kb, kg, kr, rb, rg, rr;
+                        v_load_deinterleave(ksptr, kb, kg, kr);
+                        v_load_deinterleave(rsptr, rb, rg, rr);
+
+                        v_uint16 b_l, b_h, g_l, g_h, r_l, r_h;
+                        v_expand(v_absdiff(kb, rb), b_l, b_h);
+                        v_expand(v_absdiff(kg, rg), g_l, g_h);
+                        v_expand(v_absdiff(kr, rr), r_l, r_h);
+
+                        v_uint32 val0, val1, val2, val3;
+                        v_expand(b_l + g_l + r_l, val0, val1);
+                        v_expand(b_h + g_h + r_h, val2, val3);
+
+                        v_expand(kb, b_l, b_h);
+                        v_expand(kg, g_l, g_h);
+                        v_expand(kr, r_l, r_h);
+
+                        v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0));
+                        v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1));
+                        v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2));
+                        v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3));
+                        v_store_aligned(wsum + j                      , w0 + vx_load_aligned(wsum + j));
+                        v_store_aligned(wsum + j +   v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes));
+                        v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes));
+                        v_expand(b_l, val0, val1);
+                        v_expand(b_h, val2, val3);
+                        v_store_aligned(sum_b + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_b + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes)));
+                        v_expand(g_l, val0, val1);
+                        v_expand(g_h, val2, val3);
+                        v_store_aligned(sum_g + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_g + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes)));
+                        v_expand(r_l, val0, val1);
+                        v_expand(r_h, val2, val3);
+                        v_store_aligned(sum_r + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_r + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
                     }
 #endif
-
-                    for( ; k < maxk; k++ )
+                    for(; j < size.width; j++, ksptr += 3, rsptr += 3)
                     {
-                        const uchar* sptr_k = sptr + j + space_ofs[k];
-                        int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
-                        float w = space_weight[k]*color_weight[std::abs(b - b0) +
-                                                               std::abs(g - g0) + std::abs(r - r0)];
-                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
-                        wsum += w;
+                        int b = ksptr[0], g = ksptr[1], r = ksptr[2];
+                        float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
+                        wsum[j] += w;
+                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
                     }
-                    CV_DbgAssert(fabs(wsum) > 0);
-                    wsum = 1.f/wsum;
-                    b0 = cvRound(sum_b*wsum);
-                    g0 = cvRound(sum_g*wsum);
-                    r0 = cvRound(sum_r*wsum);
-                    dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
+                }
+                j = 0;
+#if CV_SIMD
+                v_float32 v_one = vx_setall_f32(1.f);
+                for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes)
+                {
+                    v_float32 w0 = v_one / vx_load_aligned(wsum + j);
+                    v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes);
+                    v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes);
+                    v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes);
+
+                    v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)),
+                                                             v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))),
+                                                      v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)),
+                                                             v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))),
+                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)),
+                                                             v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))),
+                                                      v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)),
+                                                             v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))),
+                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)),
+                                                             v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))),
+                                                      v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)),
+                                                             v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes)))));
+                }
+#endif
+                for(; j < size.width; j++)
+                {
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    wsum[j] = 1.f/wsum[j];
+                    *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]);
+                    *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]);
+                    *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]);
                 }
             }
         }
+#if CV_SIMD
+        vx_cleanup();
+#endif
     }
 
 private:
@@ -2867,10 +2862,6 @@ public:
     {
         int i, j, k;
         Size size = dest->size();
-#if CV_SIMD128
-        int CV_DECL_ALIGNED(16) idxBuf[4];
-        bool haveSIMD128 = hasSIMD128();
-#endif
 
         for( i = range.start; i < range.end; i++ )
         {
@@ -2879,165 +2870,126 @@ public:
 
             if( cn == 1 )
             {
-                for( j = 0; j < size.width; j++ )
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
+#if CV_SIMD
+                v_float32 v_one = vx_setall_f32(1.f);
+                v_float32 sindex = vx_setall_f32(scale_index);
+#endif
+                for( k = 0; k < maxk; k++ )
                 {
-                    float sum = 0, wsum = 0;
-                    float val0 = sptr[j];
-                    k = 0;
-#if CV_SIMD128
-                    if( haveSIMD128 )
+                    const float* ksptr = sptr + space_ofs[k];
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
                     {
-                        v_float32x4 vecwsum = v_setzero_f32();
-                        v_float32x4 vecvsum = v_setzero_f32();
-                        const v_float32x4 _val0 = v_setall_f32(sptr[j]);
-                        const v_float32x4 _scale_index = v_setall_f32(scale_index);
-
-                        for (; k <= maxk - 4; k += 4)
-                        {
-                            v_float32x4 _sw = v_load(space_weight + k);
-                            v_float32x4 _val = v_float32x4(sptr[j + space_ofs[k]],
-                                sptr[j + space_ofs[k + 1]],
-                                sptr[j + space_ofs[k + 2]],
-                                sptr[j + space_ofs[k + 3]]);
-                            v_float32x4 _alpha = v_abs(_val - _val0) * _scale_index;
-
-                            v_int32x4 _idx = v_round(_alpha);
-                            v_store(idxBuf, _idx);
-                            _alpha -= v_cvt_f32(_idx);
-
-                            v_float32x4 _explut = v_float32x4(expLUT[idxBuf[0]],
-                                expLUT[idxBuf[1]],
-                                expLUT[idxBuf[2]],
-                                expLUT[idxBuf[3]]);
-                            v_float32x4 _explut1 = v_float32x4(expLUT[idxBuf[0] + 1],
-                                expLUT[idxBuf[1] + 1],
-                                expLUT[idxBuf[2] + 1],
-                                expLUT[idxBuf[3] + 1]);
-
-                            v_float32x4 _w = _sw * (_explut + (_alpha * (_explut1 - _explut)));
-                            _val *= _w;
-
-                            vecwsum += _w;
-                            vecvsum += _val;
-                        }
-                        float *bufFloat = (float*)idxBuf;
-                        v_float32x4 sum4 = v_reduce_sum4(vecwsum, vecvsum, vecwsum, vecvsum);
-                        v_store(bufFloat, sum4);
-                        sum += bufFloat[1];
-                        wsum += bufFloat[0];
+                        v_float32 val = vx_load(ksptr + j);
+
+                        v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex;
+                        v_int32 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+
+                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha));
+                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                        v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j)));
                     }
 #endif
-
-                    for( ; k < maxk; k++ )
+                    for (; j < size.width; j++)
                     {
-                        float val = sptr[j + space_ofs[k]];
-                        float alpha = (float)(std::abs(val - val0)*scale_index);
+                        float val = ksptr[j];
+                        float alpha = std::abs(val - sptr[j]) * scale_index;
                         int idx = cvFloor(alpha);
                         alpha -= idx;
-                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
-                        sum += val*w;
-                        wsum += w;
+                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
+                        wsum[j] += w;
+                        sum[j] += val * w;
                     }
-                    CV_DbgAssert(fabs(wsum) > 0);
-                    dptr[j] = (float)(sum/wsum);
+                }
+                j = 0;
+#if CV_SIMD
+                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j));
+#endif
+                for (; j < size.width; j++)
+                {
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    dptr[j] = sum[j] / wsum[j];
                 }
             }
             else
             {
                 CV_Assert( cn == 3 );
-                for( j = 0; j < size.width*3; j += 3 )
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
+                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
+                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
+#if CV_SIMD
+                v_float32 v_one = vx_setall_f32(1.f);
+                v_float32 sindex = vx_setall_f32(scale_index);
+#endif
+                for (k = 0; k < maxk; k++)
                 {
-                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
-                    float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
-                    k = 0;
-#if CV_SIMD128
-                    if( haveSIMD128 )
+                    const float* ksptr = sptr + space_ofs[k];
+                    const float* rsptr = sptr;
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes)
                     {
-                        v_float32x4 sumw = v_setzero_f32();
-                        v_float32x4 sumb = v_setzero_f32();
-                        v_float32x4 sumg = v_setzero_f32();
-                        v_float32x4 sumr = v_setzero_f32();
-                        const v_float32x4 _b0 = v_setall_f32(b0);
-                        const v_float32x4 _g0 = v_setall_f32(g0);
-                        const v_float32x4 _r0 = v_setall_f32(r0);
-                        const v_float32x4 _scale_index = v_setall_f32(scale_index);
-
-                        for( ; k <= maxk-4; k += 4 )
-                        {
-                            v_float32x4 _sw = v_load(space_weight + k);
-
-                            const float* const sptr_k0 = sptr + j + space_ofs[k];
-                            const float* const sptr_k1 = sptr + j + space_ofs[k+1];
-                            const float* const sptr_k2 = sptr + j + space_ofs[k+2];
-                            const float* const sptr_k3 = sptr + j + space_ofs[k+3];
-
-                            v_float32x4 _v0 = v_load(sptr_k0);
-                            v_float32x4 _v1 = v_load(sptr_k1);
-                            v_float32x4 _v2 = v_load(sptr_k2);
-                            v_float32x4 _v3 = v_load(sptr_k3);
-                            v_float32x4 _b, _g, _r, _dummy;
-
-                            v_transpose4x4(_v0, _v1, _v2, _v3, _b, _g, _r, _dummy);
-
-                            v_float32x4 _bt = v_abs(_b - _b0);
-                            v_float32x4 _gt = v_abs(_g - _g0);
-                            v_float32x4 _rt = v_abs(_r - _r0);
-                            v_float32x4 _alpha = _scale_index * (_bt + _gt + _rt);
-
-                            v_int32x4 _idx = v_round(_alpha);
-                            v_store((int*)idxBuf, _idx);
-                            _alpha -= v_cvt_f32(_idx);
-
-                            v_float32x4 _explut = v_float32x4(expLUT[idxBuf[0]],
-                                expLUT[idxBuf[1]],
-                                expLUT[idxBuf[2]],
-                                expLUT[idxBuf[3]]);
-                            v_float32x4 _explut1 = v_float32x4(expLUT[idxBuf[0] + 1],
-                                expLUT[idxBuf[1] + 1],
-                                expLUT[idxBuf[2] + 1],
-                                expLUT[idxBuf[3] + 1]);
-
-                            v_float32x4 _w = _sw * (_explut + (_alpha * (_explut1 - _explut)));
-
-                            _b *=  _w;
-                            _g *=  _w;
-                            _r *=  _w;
-                            sumw += _w;
-                            sumb += _b;
-                            sumg += _g;
-                            sumr += _r;
-                        }
-                        v_float32x4 sum4 = v_reduce_sum4(sumw, sumb, sumg, sumr);
-                        float *bufFloat = (float*)idxBuf;
-                        v_store(bufFloat, sum4);
-                        wsum += bufFloat[0];
-                        sum_b += bufFloat[1];
-                        sum_g += bufFloat[2];
-                        sum_r += bufFloat[3];
+                        v_float32 kb, kg, kr, rb, rg, rr;
+                        v_load_deinterleave(ksptr, kb, kg, kr);
+                        v_load_deinterleave(rsptr, rb, rg, rr);
+
+                        v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex;
+                        v_int32 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+
+                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha));
+                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                        v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j)));
                     }
 #endif
-
-                    for(; k < maxk; k++ )
+                    for (; j < size.width; j++, ksptr += 3, rsptr += 3)
                     {
-                        const float* sptr_k = sptr + j + space_ofs[k];
-                        float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
-                        float alpha = (float)((std::abs(b - b0) +
-                            std::abs(g - g0) + std::abs(r - r0))*scale_index);
+                        float b = ksptr[0], g = ksptr[1], r = ksptr[2];
+                        float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index;
                         int idx = cvFloor(alpha);
                         alpha -= idx;
-                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
-                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
-                        wsum += w;
+                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]));
+                        wsum[j] += w;
+                        sum_b[j] += b*w;
+                        sum_g[j] += g*w;
+                        sum_r[j] += r*w;
                     }
-                    CV_DbgAssert(fabs(wsum) > 0);
-                    wsum = 1.f/wsum;
-                    b0 = sum_b*wsum;
-                    g0 = sum_g*wsum;
-                    r0 = sum_r*wsum;
-                    dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
+                }
+                j = 0;
+#if CV_SIMD
+                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes)
+                {
+                    v_float32 w = v_one / vx_load_aligned(wsum + j);
+                    v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w);
+                }
+#endif
+                for (; j < size.width; j++)
+                {
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    wsum[j] = 1.f / wsum[j];
+                    *(dptr++) = sum_b[j] * wsum[j];
+                    *(dptr++) = sum_g[j] * wsum[j];
+                    *(dptr++) = sum_r[j] * wsum[j];
                 }
             }
         }
+#if CV_SIMD
+        vx_cleanup();
+#endif
     }
 
 private:

From 2dd98e7cc6cf39c3646ab3e2a0f85d4872773c31 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <terfendail@mediana.jetos.com>
Date: Fri, 9 Nov 2018 16:17:58 +0300
Subject: [PATCH 2/2] bilateralFilter implementation moved to separate file

---
 modules/imgproc/src/bilateral_filter.cpp | 759 +++++++++++++++++++++++
 modules/imgproc/src/smooth.cpp           | 708 ---------------------
 2 files changed, 759 insertions(+), 708 deletions(-)
 create mode 100644 modules/imgproc/src/bilateral_filter.cpp

diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.cpp
new file mode 100644
index 0000000000..5e39fa4de5
--- /dev/null
+++ b/modules/imgproc/src/bilateral_filter.cpp
@@ -0,0 +1,759 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#include <vector>
+
+#include "opencv2/core/hal/intrin.hpp"
+#include "opencl_kernels_imgproc.hpp"
+
+/****************************************************************************************\
+                                   Bilateral Filtering
+\****************************************************************************************/
+
+namespace cv
+{
+
+class BilateralFilter_8u_Invoker :
+    public ParallelLoopBody
+{
+public:
+    BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
+        int* _space_ofs, float *_space_weight, float *_color_weight) :
+        temp(&_temp), dest(&_dest), radius(_radius),
+        maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
+    {
+    }
+
+    virtual void operator() (const Range& range) const CV_OVERRIDE
+    {
+        int i, j, cn = dest->channels(), k;
+        Size size = dest->size();
+
+        for( i = range.start; i < range.end; i++ )
+        {
+            const uchar* sptr = temp->ptr(i+radius) + radius*cn;
+            uchar* dptr = dest->ptr(i);
+
+            if( cn == 1 )
+            {
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
+                for( k = 0; k < maxk; k++ )
+                {
+                    const uchar* ksptr = sptr + space_ofs[k];
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    {
+                        v_uint32 val = vx_load_expand_q(ksptr + j);
+                        v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
+                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                        v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)));
+                    }
+#endif
+                    for (; j < size.width; j++)
+                    {
+                        int val = ksptr[j];
+                        float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
+                        wsum[j] += w;
+                        sum[j] += val * w;
+                    }
+                }
+                j = 0;
+#if CV_SIMD
+                for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes)
+                    v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j                    ) / vx_load_aligned(wsum + j                    )),
+                                                    v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes))));
+#endif
+                for (; j < size.width; j++)
+                {
+                    // overflow is not possible here => there is no need to use cv::saturate_cast
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    dptr[j] = (uchar)cvRound(sum[j]/wsum[j]);
+                }
+            }
+            else
+            {
+                assert( cn == 3 );
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
+                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
+                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
+                for(k = 0; k < maxk; k++ )
+                {
+                    const uchar* ksptr = sptr + space_ofs[k];
+                    const uchar* rsptr = sptr;
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes)
+                    {
+                        v_uint8 kb, kg, kr, rb, rg, rr;
+                        v_load_deinterleave(ksptr, kb, kg, kr);
+                        v_load_deinterleave(rsptr, rb, rg, rr);
+
+                        v_uint16 b_l, b_h, g_l, g_h, r_l, r_h;
+                        v_expand(v_absdiff(kb, rb), b_l, b_h);
+                        v_expand(v_absdiff(kg, rg), g_l, g_h);
+                        v_expand(v_absdiff(kr, rr), r_l, r_h);
+
+                        v_uint32 val0, val1, val2, val3;
+                        v_expand(b_l + g_l + r_l, val0, val1);
+                        v_expand(b_h + g_h + r_h, val2, val3);
+
+                        v_expand(kb, b_l, b_h);
+                        v_expand(kg, g_l, g_h);
+                        v_expand(kr, r_l, r_h);
+
+                        v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0));
+                        v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1));
+                        v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2));
+                        v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3));
+                        v_store_aligned(wsum + j                      , w0 + vx_load_aligned(wsum + j));
+                        v_store_aligned(wsum + j +   v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes));
+                        v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes));
+                        v_expand(b_l, val0, val1);
+                        v_expand(b_h, val2, val3);
+                        v_store_aligned(sum_b + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_b + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes)));
+                        v_expand(g_l, val0, val1);
+                        v_expand(g_h, val2, val3);
+                        v_store_aligned(sum_g + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_g + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes)));
+                        v_expand(r_l, val0, val1);
+                        v_expand(r_h, val2, val3);
+                        v_store_aligned(sum_r + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_r + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+                    }
+#endif
+                    for(; j < size.width; j++, ksptr += 3, rsptr += 3)
+                    {
+                        int b = ksptr[0], g = ksptr[1], r = ksptr[2];
+                        float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
+                        wsum[j] += w;
+                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
+                    }
+                }
+                j = 0;
+#if CV_SIMD
+                v_float32 v_one = vx_setall_f32(1.f);
+                for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes)
+                {
+                    v_float32 w0 = v_one / vx_load_aligned(wsum + j);
+                    v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes);
+                    v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes);
+                    v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes);
+
+                    v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)),
+                                                             v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))),
+                                                      v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)),
+                                                             v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))),
+                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)),
+                                                             v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))),
+                                                      v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)),
+                                                             v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))),
+                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)),
+                                                             v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))),
+                                                      v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)),
+                                                             v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes)))));
+                }
+#endif
+                for(; j < size.width; j++)
+                {
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    wsum[j] = 1.f/wsum[j];
+                    *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]);
+                    *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]);
+                    *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]);
+                }
+            }
+        }
+#if CV_SIMD
+        vx_cleanup();
+#endif
+    }
+
+private:
+    const Mat *temp;
+    Mat *dest;
+    int radius, maxk, *space_ofs;
+    float *space_weight, *color_weight;
+};
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
+                                   double sigma_color, double sigma_space,
+                                   int borderType)
+{
+#ifdef __ANDROID__
+    if (ocl::Device::getDefault().isNVidia())
+        return false;
+#endif
+
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    int i, j, maxk, radius;
+
+    if (depth != CV_8U || cn > 4)
+        return false;
+
+    if (sigma_color <= 0)
+        sigma_color = 1;
+    if (sigma_space <= 0)
+        sigma_space = 1;
+
+    double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
+    double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
+
+    if ( d <= 0 )
+        radius = cvRound(sigma_space * 1.5);
+    else
+        radius = d / 2;
+    radius = MAX(radius, 1);
+    d = radius * 2 + 1;
+
+    UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
+    if (src.u == dst.u)
+        return false;
+
+    copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
+    std::vector<float> _space_weight(d * d);
+    std::vector<int> _space_ofs(d * d);
+    float * const space_weight = &_space_weight[0];
+    int * const space_ofs = &_space_ofs[0];
+
+    // initialize space-related bilateral filter coefficients
+    for( i = -radius, maxk = 0; i <= radius; i++ )
+        for( j = -radius; j <= radius; j++ )
+        {
+            double r = std::sqrt((double)i * i + (double)j * j);
+            if ( r > radius )
+                continue;
+            space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
+            space_ofs[maxk++] = (int)(i * temp.step + j * cn);
+        }
+
+    char cvt[3][40];
+    String cnstr = cn > 1 ? format("%d", cn) : "";
+    String kernelName("bilateral");
+    size_t sizeDiv = 1;
+    if ((ocl::Device::getDefault().isIntel()) &&
+        (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
+    {
+            //Intel GPU
+            if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images.
+            {
+                kernelName = "bilateral_float4";
+                sizeDiv = 4;
+            }
+     }
+     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
+            format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
+            " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f",
+            radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
+            ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
+            ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
+            ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
+            ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
+    if (k.empty())
+        return false;
+
+    Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
+    Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
+    UMat ucolor_weight, uspace_weight, uspace_ofs;
+
+    mspace_weight.copyTo(uspace_weight);
+    mspace_ofs.copyTo(uspace_ofs);
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
+           ocl::KernelArg::PtrReadOnly(uspace_weight),
+           ocl::KernelArg::PtrReadOnly(uspace_ofs));
+
+    size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
+static void
+bilateralFilter_8u( const Mat& src, Mat& dst, int d,
+    double sigma_color, double sigma_space,
+    int borderType )
+{
+    int cn = src.channels();
+    int i, j, maxk, radius;
+    Size size = src.size();
+
+    CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
+
+    if( sigma_color <= 0 )
+        sigma_color = 1;
+    if( sigma_space <= 0 )
+        sigma_space = 1;
+
+    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
+    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
+
+    if( d <= 0 )
+        radius = cvRound(sigma_space*1.5);
+    else
+        radius = d/2;
+    radius = MAX(radius, 1);
+    d = radius*2 + 1;
+
+    Mat temp;
+    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
+
+    std::vector<float> _color_weight(cn*256);
+    std::vector<float> _space_weight(d*d);
+    std::vector<int> _space_ofs(d*d);
+    float* color_weight = &_color_weight[0];
+    float* space_weight = &_space_weight[0];
+    int* space_ofs = &_space_ofs[0];
+
+    // initialize color-related bilateral filter coefficients
+
+    for( i = 0; i < 256*cn; i++ )
+        color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
+
+    // initialize space-related bilateral filter coefficients
+    for( i = -radius, maxk = 0; i <= radius; i++ )
+    {
+        j = -radius;
+
+        for( ; j <= radius; j++ )
+        {
+            double r = std::sqrt((double)i*i + (double)j*j);
+            if( r > radius )
+                continue;
+            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
+            space_ofs[maxk++] = (int)(i*temp.step + j*cn);
+        }
+    }
+
+    BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
+    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
+}
+
+
+class BilateralFilter_32f_Invoker :
+    public ParallelLoopBody
+{
+public:
+
+    BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
+        const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
+        cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
+        temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
+    {
+    }
+
+    virtual void operator() (const Range& range) const CV_OVERRIDE
+    {
+        int i, j, k;
+        Size size = dest->size();
+
+        for( i = range.start; i < range.end; i++ )
+        {
+            const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
+            float* dptr = dest->ptr<float>(i);
+
+            if( cn == 1 )
+            {
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
+#if CV_SIMD
+                v_float32 v_one = vx_setall_f32(1.f);
+                v_float32 sindex = vx_setall_f32(scale_index);
+#endif
+                for( k = 0; k < maxk; k++ )
+                {
+                    const float* ksptr = sptr + space_ofs[k];
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    {
+                        v_float32 val = vx_load(ksptr + j);
+
+                        v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex;
+                        v_int32 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+
+                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha));
+                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                        v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j)));
+                    }
+#endif
+                    for (; j < size.width; j++)
+                    {
+                        float val = ksptr[j];
+                        float alpha = std::abs(val - sptr[j]) * scale_index;
+                        int idx = cvFloor(alpha);
+                        alpha -= idx;
+                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
+                        wsum[j] += w;
+                        sum[j] += val * w;
+                    }
+                }
+                j = 0;
+#if CV_SIMD
+                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j));
+#endif
+                for (; j < size.width; j++)
+                {
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    dptr[j] = sum[j] / wsum[j];
+                }
+            }
+            else
+            {
+                CV_Assert( cn == 3 );
+                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
+                memset(buf.data(), 0, buf.size() * sizeof(float));
+                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
+                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
+                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
+                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
+#if CV_SIMD
+                v_float32 v_one = vx_setall_f32(1.f);
+                v_float32 sindex = vx_setall_f32(scale_index);
+#endif
+                for (k = 0; k < maxk; k++)
+                {
+                    const float* ksptr = sptr + space_ofs[k];
+                    const float* rsptr = sptr;
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight = vx_setall_f32(space_weight[k]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes)
+                    {
+                        v_float32 kb, kg, kr, rb, rg, rr;
+                        v_load_deinterleave(ksptr, kb, kg, kr);
+                        v_load_deinterleave(rsptr, rb, rg, rr);
+
+                        v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex;
+                        v_int32 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+
+                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha));
+                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                        v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j)));
+                    }
+#endif
+                    for (; j < size.width; j++, ksptr += 3, rsptr += 3)
+                    {
+                        float b = ksptr[0], g = ksptr[1], r = ksptr[2];
+                        float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index;
+                        int idx = cvFloor(alpha);
+                        alpha -= idx;
+                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]));
+                        wsum[j] += w;
+                        sum_b[j] += b*w;
+                        sum_g[j] += g*w;
+                        sum_r[j] += r*w;
+                    }
+                }
+                j = 0;
+#if CV_SIMD
+                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes)
+                {
+                    v_float32 w = v_one / vx_load_aligned(wsum + j);
+                    v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w);
+                }
+#endif
+                for (; j < size.width; j++)
+                {
+                    CV_DbgAssert(fabs(wsum[j]) > 0);
+                    wsum[j] = 1.f / wsum[j];
+                    *(dptr++) = sum_b[j] * wsum[j];
+                    *(dptr++) = sum_g[j] * wsum[j];
+                    *(dptr++) = sum_r[j] * wsum[j];
+                }
+            }
+        }
+#if CV_SIMD
+        vx_cleanup();
+#endif
+    }
+
+private:
+    int cn, radius, maxk, *space_ofs;
+    const Mat* temp;
+    Mat *dest;
+    float scale_index, *space_weight, *expLUT;
+};
+
+
+static void
+bilateralFilter_32f( const Mat& src, Mat& dst, int d,
+                     double sigma_color, double sigma_space,
+                     int borderType )
+{
+    int cn = src.channels();
+    int i, j, maxk, radius;
+    double minValSrc=-1, maxValSrc=1;
+    const int kExpNumBinsPerChannel = 1 << 12;
+    int kExpNumBins = 0;
+    float lastExpVal = 1.f;
+    float len, scale_index;
+    Size size = src.size();
+
+    CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
+
+    if( sigma_color <= 0 )
+        sigma_color = 1;
+    if( sigma_space <= 0 )
+        sigma_space = 1;
+
+    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
+    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
+
+    if( d <= 0 )
+        radius = cvRound(sigma_space*1.5);
+    else
+        radius = d/2;
+    radius = MAX(radius, 1);
+    d = radius*2 + 1;
+    // compute the min/max range for the input image (even if multichannel)
+
+    minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
+    if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
+    {
+        src.copyTo(dst);
+        return;
+    }
+
+    // temporary copy of the image with borders for easy processing
+    Mat temp;
+    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
+    minValSrc -= 5. * sigma_color;
+    patchNaNs( temp, minValSrc ); // this replacement of NaNs makes the assumption that depth values are nonnegative
+                                  // TODO: make replacement parameter avalible in the outside function interface
+    // allocate lookup tables
+    std::vector<float> _space_weight(d*d);
+    std::vector<int> _space_ofs(d*d);
+    float* space_weight = &_space_weight[0];
+    int* space_ofs = &_space_ofs[0];
+
+    // assign a length which is slightly more than needed
+    len = (float)(maxValSrc - minValSrc) * cn;
+    kExpNumBins = kExpNumBinsPerChannel * cn;
+    std::vector<float> _expLUT(kExpNumBins+2);
+    float* expLUT = &_expLUT[0];
+
+    scale_index = kExpNumBins/len;
+
+    // initialize the exp LUT
+    for( i = 0; i < kExpNumBins+2; i++ )
+    {
+        if( lastExpVal > 0.f )
+        {
+            double val =  i / scale_index;
+            expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
+            lastExpVal = expLUT[i];
+        }
+        else
+            expLUT[i] = 0.f;
+    }
+
+    // initialize space-related bilateral filter coefficients
+    for( i = -radius, maxk = 0; i <= radius; i++ )
+        for( j = -radius; j <= radius; j++ )
+        {
+            double r = std::sqrt((double)i*i + (double)j*j);
+            if( r > radius )
+                continue;
+            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
+            space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
+        }
+
+    // parallel_for usage
+
+    BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
+    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
+}
+
+#ifdef HAVE_IPP
+#define IPP_BILATERAL_PARALLEL 1
+
+#ifdef HAVE_IPP_IW
+class ipp_bilateralFilterParallel: public ParallelLoopBody
+{
+public:
+    ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok):
+        src(_src), dst(_dst)
+    {
+        pOk = _ok;
+
+        radius          = _radius;
+        valSquareSigma  = _valSquareSigma;
+        posSquareSigma  = _posSquareSigma;
+        borderType      = _borderType;
+
+        *pOk = true;
+    }
+    ~ipp_bilateralFilterParallel() {}
+
+    virtual void operator() (const Range& range) const CV_OVERRIDE
+    {
+        if(*pOk == false)
+            return;
+
+        try
+        {
+            ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start);
+            CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile);
+        }
+        catch(const ::ipp::IwException &)
+        {
+            *pOk = false;
+            return;
+        }
+    }
+private:
+    ::ipp::IwiImage &src;
+    ::ipp::IwiImage &dst;
+
+    int                  radius;
+    Ipp32f               valSquareSigma;
+    Ipp32f               posSquareSigma;
+    ::ipp::IwiBorderType borderType;
+
+    bool  *pOk;
+    const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&);
+};
+#endif
+
+static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType)
+{
+#ifdef HAVE_IPP_IW
+    CV_INSTRUMENT_REGION_IPP();
+
+    int         radius         = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1);
+    Ipp32f      valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor);
+    Ipp32f      posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace);
+
+    // Acquire data and begin processing
+    try
+    {
+        ::ipp::IwiImage      iwSrc = ippiGetImage(src);
+        ::ipp::IwiImage      iwDst = ippiGetImage(dst);
+        ::ipp::IwiBorderSize borderSize(radius);
+        ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize));
+        if(!ippBorder)
+            return false;
+
+        const int threads = ippiSuggestThreadsNum(iwDst, 2);
+        if(IPP_BILATERAL_PARALLEL && threads > 1) {
+            bool  ok      = true;
+            Range range(0, (int)iwDst.m_size.height);
+            ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok);
+            if(!ok)
+                return false;
+
+            parallel_for_(range, invoker, threads*4);
+
+            if(!ok)
+                return false;
+        } else {
+            CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder);
+        }
+    }
+    catch (const ::ipp::IwException &)
+    {
+        return false;
+    }
+    return true;
+#else
+    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType);
+    return false;
+#endif
+}
+#endif
+
+}
+
+void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
+                      double sigmaColor, double sigmaSpace,
+                      int borderType )
+{
+    CV_INSTRUMENT_REGION();
+
+    _dst.create( _src.size(), _src.type() );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
+
+    Mat src = _src.getMat(), dst = _dst.getMat();
+
+    CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType));
+
+    if( src.depth() == CV_8U )
+        bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
+    else if( src.depth() == CV_32F )
+        bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
+    else
+        CV_Error( CV_StsUnsupportedFormat,
+        "Bilateral filtering is only implemented for 8u and 32f images" );
+}
+
+/* End of file. */
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index e2fe6a58d5..05b2db5b46 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -2505,714 +2505,6 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
     sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType);
 }
 
-/****************************************************************************************\
-                                   Bilateral Filtering
-\****************************************************************************************/
-
-namespace cv
-{
-
-class BilateralFilter_8u_Invoker :
-    public ParallelLoopBody
-{
-public:
-    BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
-        int* _space_ofs, float *_space_weight, float *_color_weight) :
-        temp(&_temp), dest(&_dest), radius(_radius),
-        maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
-    {
-    }
-
-    virtual void operator() (const Range& range) const CV_OVERRIDE
-    {
-        int i, j, cn = dest->channels(), k;
-        Size size = dest->size();
-
-        for( i = range.start; i < range.end; i++ )
-        {
-            const uchar* sptr = temp->ptr(i+radius) + radius*cn;
-            uchar* dptr = dest->ptr(i);
-
-            if( cn == 1 )
-            {
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-                for( k = 0; k < maxk; k++ )
-                {
-                    const uchar* ksptr = sptr + space_ofs[k];
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    {
-                        v_uint32 val = vx_load_expand_q(ksptr + j);
-                        v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)));
-                    }
-#endif
-                    for (; j < size.width; j++)
-                    {
-                        int val = ksptr[j];
-                        float w = space_weight[k] * color_weight[std::abs(val - sptr[j])];
-                        wsum[j] += w;
-                        sum[j] += val * w;
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes)
-                    v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j                    ) / vx_load_aligned(wsum + j                    )),
-                                                    v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes))));
-#endif
-                for (; j < size.width; j++)
-                {
-                    // overflow is not possible here => there is no need to use cv::saturate_cast
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    dptr[j] = (uchar)cvRound(sum[j]/wsum[j]);
-                }
-            }
-            else
-            {
-                assert( cn == 3 );
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
-                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
-                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-                for(k = 0; k < maxk; k++ )
-                {
-                    const uchar* ksptr = sptr + space_ofs[k];
-                    const uchar* rsptr = sptr;
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes)
-                    {
-                        v_uint8 kb, kg, kr, rb, rg, rr;
-                        v_load_deinterleave(ksptr, kb, kg, kr);
-                        v_load_deinterleave(rsptr, rb, rg, rr);
-
-                        v_uint16 b_l, b_h, g_l, g_h, r_l, r_h;
-                        v_expand(v_absdiff(kb, rb), b_l, b_h);
-                        v_expand(v_absdiff(kg, rg), g_l, g_h);
-                        v_expand(v_absdiff(kr, rr), r_l, r_h);
-
-                        v_uint32 val0, val1, val2, val3;
-                        v_expand(b_l + g_l + r_l, val0, val1);
-                        v_expand(b_h + g_h + r_h, val2, val3);
-
-                        v_expand(kb, b_l, b_h);
-                        v_expand(kg, g_l, g_h);
-                        v_expand(kr, r_l, r_h);
-
-                        v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0));
-                        v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1));
-                        v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2));
-                        v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3));
-                        v_store_aligned(wsum + j                      , w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j +   v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-                        v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes));
-                        v_expand(b_l, val0, val1);
-                        v_expand(b_h, val2, val3);
-                        v_store_aligned(sum_b + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes)));
-                        v_expand(g_l, val0, val1);
-                        v_expand(g_h, val2, val3);
-                        v_store_aligned(sum_g + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes)));
-                        v_expand(r_l, val0, val1);
-                        v_expand(r_h, val2, val3);
-                        v_store_aligned(sum_r + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
-                    }
-#endif
-                    for(; j < size.width; j++, ksptr += 3, rsptr += 3)
-                    {
-                        int b = ksptr[0], g = ksptr[1], r = ksptr[2];
-                        float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])];
-                        wsum[j] += w;
-                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                v_float32 v_one = vx_setall_f32(1.f);
-                for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes)
-                {
-                    v_float32 w0 = v_one / vx_load_aligned(wsum + j);
-                    v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes);
-                    v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes);
-                    v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes);
-
-                    v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))),
-                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))),
-                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes)))));
-                }
-#endif
-                for(; j < size.width; j++)
-                {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    wsum[j] = 1.f/wsum[j];
-                    *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]);
-                    *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]);
-                    *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]);
-                }
-            }
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-
-private:
-    const Mat *temp;
-    Mat *dest;
-    int radius, maxk, *space_ofs;
-    float *space_weight, *color_weight;
-};
-
-#ifdef HAVE_OPENCL
-
-static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
-                                   double sigma_color, double sigma_space,
-                                   int borderType)
-{
-#ifdef __ANDROID__
-    if (ocl::Device::getDefault().isNVidia())
-        return false;
-#endif
-
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    int i, j, maxk, radius;
-
-    if (depth != CV_8U || cn > 4)
-        return false;
-
-    if (sigma_color <= 0)
-        sigma_color = 1;
-    if (sigma_space <= 0)
-        sigma_space = 1;
-
-    double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
-    double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
-
-    if ( d <= 0 )
-        radius = cvRound(sigma_space * 1.5);
-    else
-        radius = d / 2;
-    radius = MAX(radius, 1);
-    d = radius * 2 + 1;
-
-    UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
-    if (src.u == dst.u)
-        return false;
-
-    copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
-    std::vector<float> _space_weight(d * d);
-    std::vector<int> _space_ofs(d * d);
-    float * const space_weight = &_space_weight[0];
-    int * const space_ofs = &_space_ofs[0];
-
-    // initialize space-related bilateral filter coefficients
-    for( i = -radius, maxk = 0; i <= radius; i++ )
-        for( j = -radius; j <= radius; j++ )
-        {
-            double r = std::sqrt((double)i * i + (double)j * j);
-            if ( r > radius )
-                continue;
-            space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i * temp.step + j * cn);
-        }
-
-    char cvt[3][40];
-    String cnstr = cn > 1 ? format("%d", cn) : "";
-    String kernelName("bilateral");
-    size_t sizeDiv = 1;
-    if ((ocl::Device::getDefault().isIntel()) &&
-        (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
-    {
-            //Intel GPU
-            if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images.
-            {
-                kernelName = "bilateral_float4";
-                sizeDiv = 4;
-            }
-     }
-     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
-            format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
-            " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f",
-            radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
-            ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
-            ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
-            ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
-            ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
-    if (k.empty())
-        return false;
-
-    Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
-    Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
-    UMat ucolor_weight, uspace_weight, uspace_ofs;
-
-    mspace_weight.copyTo(uspace_weight);
-    mspace_ofs.copyTo(uspace_ofs);
-
-    k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
-           ocl::KernelArg::PtrReadOnly(uspace_weight),
-           ocl::KernelArg::PtrReadOnly(uspace_ofs));
-
-    size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
-    return k.run(2, globalsize, NULL, false);
-}
-
-#endif
-static void
-bilateralFilter_8u( const Mat& src, Mat& dst, int d,
-    double sigma_color, double sigma_space,
-    int borderType )
-{
-    int cn = src.channels();
-    int i, j, maxk, radius;
-    Size size = src.size();
-
-    CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
-
-    if( sigma_color <= 0 )
-        sigma_color = 1;
-    if( sigma_space <= 0 )
-        sigma_space = 1;
-
-    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
-    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
-
-    if( d <= 0 )
-        radius = cvRound(sigma_space*1.5);
-    else
-        radius = d/2;
-    radius = MAX(radius, 1);
-    d = radius*2 + 1;
-
-    Mat temp;
-    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-
-    std::vector<float> _color_weight(cn*256);
-    std::vector<float> _space_weight(d*d);
-    std::vector<int> _space_ofs(d*d);
-    float* color_weight = &_color_weight[0];
-    float* space_weight = &_space_weight[0];
-    int* space_ofs = &_space_ofs[0];
-
-    // initialize color-related bilateral filter coefficients
-
-    for( i = 0; i < 256*cn; i++ )
-        color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
-
-    // initialize space-related bilateral filter coefficients
-    for( i = -radius, maxk = 0; i <= radius; i++ )
-    {
-        j = -radius;
-
-        for( ; j <= radius; j++ )
-        {
-            double r = std::sqrt((double)i*i + (double)j*j);
-            if( r > radius )
-                continue;
-            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i*temp.step + j*cn);
-        }
-    }
-
-    BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
-}
-
-
-class BilateralFilter_32f_Invoker :
-    public ParallelLoopBody
-{
-public:
-
-    BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
-        const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
-        cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
-        temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
-    {
-    }
-
-    virtual void operator() (const Range& range) const CV_OVERRIDE
-    {
-        int i, j, k;
-        Size size = dest->size();
-
-        for( i = range.start; i < range.end; i++ )
-        {
-            const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
-            float* dptr = dest->ptr<float>(i);
-
-            if( cn == 1 )
-            {
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
-                v_float32 v_one = vx_setall_f32(1.f);
-                v_float32 sindex = vx_setall_f32(scale_index);
-#endif
-                for( k = 0; k < maxk; k++ )
-                {
-                    const float* ksptr = sptr + space_ofs[k];
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    {
-                        v_float32 val = vx_load(ksptr + j);
-
-                        v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex;
-                        v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-
-                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha));
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j)));
-                    }
-#endif
-                    for (; j < size.width; j++)
-                    {
-                        float val = ksptr[j];
-                        float alpha = std::abs(val - sptr[j]) * scale_index;
-                        int idx = cvFloor(alpha);
-                        alpha -= idx;
-                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
-                        wsum[j] += w;
-                        sum[j] += val * w;
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j));
-#endif
-                for (; j < size.width; j++)
-                {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    dptr[j] = sum[j] / wsum[j];
-                }
-            }
-            else
-            {
-                CV_Assert( cn == 3 );
-                AutoBuffer<float> buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1);
-                memset(buf.data(), 0, buf.size() * sizeof(float));
-                float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH);
-                float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
-                float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
-                float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
-                v_float32 v_one = vx_setall_f32(1.f);
-                v_float32 sindex = vx_setall_f32(scale_index);
-#endif
-                for (k = 0; k < maxk; k++)
-                {
-                    const float* ksptr = sptr + space_ofs[k];
-                    const float* rsptr = sptr;
-                    j = 0;
-#if CV_SIMD
-                    v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes)
-                    {
-                        v_float32 kb, kg, kr, rb, rg, rr;
-                        v_load_deinterleave(ksptr, kb, kg, kr);
-                        v_load_deinterleave(rsptr, rb, rg, rr);
-
-                        v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex;
-                        v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-
-                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha));
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j)));
-                    }
-#endif
-                    for (; j < size.width; j++, ksptr += 3, rsptr += 3)
-                    {
-                        float b = ksptr[0], g = ksptr[1], r = ksptr[2];
-                        float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index;
-                        int idx = cvFloor(alpha);
-                        alpha -= idx;
-                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]));
-                        wsum[j] += w;
-                        sum_b[j] += b*w;
-                        sum_g[j] += g*w;
-                        sum_r[j] += r*w;
-                    }
-                }
-                j = 0;
-#if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes)
-                {
-                    v_float32 w = v_one / vx_load_aligned(wsum + j);
-                    v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w);
-                }
-#endif
-                for (; j < size.width; j++)
-                {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    wsum[j] = 1.f / wsum[j];
-                    *(dptr++) = sum_b[j] * wsum[j];
-                    *(dptr++) = sum_g[j] * wsum[j];
-                    *(dptr++) = sum_r[j] * wsum[j];
-                }
-            }
-        }
-#if CV_SIMD
-        vx_cleanup();
-#endif
-    }
-
-private:
-    int cn, radius, maxk, *space_ofs;
-    const Mat* temp;
-    Mat *dest;
-    float scale_index, *space_weight, *expLUT;
-};
-
-
-static void
-bilateralFilter_32f( const Mat& src, Mat& dst, int d,
-                     double sigma_color, double sigma_space,
-                     int borderType )
-{
-    int cn = src.channels();
-    int i, j, maxk, radius;
-    double minValSrc=-1, maxValSrc=1;
-    const int kExpNumBinsPerChannel = 1 << 12;
-    int kExpNumBins = 0;
-    float lastExpVal = 1.f;
-    float len, scale_index;
-    Size size = src.size();
-
-    CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
-
-    if( sigma_color <= 0 )
-        sigma_color = 1;
-    if( sigma_space <= 0 )
-        sigma_space = 1;
-
-    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
-    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
-
-    if( d <= 0 )
-        radius = cvRound(sigma_space*1.5);
-    else
-        radius = d/2;
-    radius = MAX(radius, 1);
-    d = radius*2 + 1;
-    // compute the min/max range for the input image (even if multichannel)
-
-    minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
-    if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    // temporary copy of the image with borders for easy processing
-    Mat temp;
-    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-    const double insteadNaNValue = -5. * sigma_color;
-    patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative
-                                        // TODO: make insteadNaNValue avalible in the outside function interface to control the cases breaking the assumption
-    // allocate lookup tables
-    std::vector<float> _space_weight(d*d);
-    std::vector<int> _space_ofs(d*d);
-    float* space_weight = &_space_weight[0];
-    int* space_ofs = &_space_ofs[0];
-
-    // assign a length which is slightly more than needed
-    len = (float)(maxValSrc - minValSrc) * cn;
-    kExpNumBins = kExpNumBinsPerChannel * cn;
-    std::vector<float> _expLUT(kExpNumBins+2);
-    float* expLUT = &_expLUT[0];
-
-    scale_index = kExpNumBins/len;
-
-    // initialize the exp LUT
-    for( i = 0; i < kExpNumBins+2; i++ )
-    {
-        if( lastExpVal > 0.f )
-        {
-            double val =  i / scale_index;
-            expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
-            lastExpVal = expLUT[i];
-        }
-        else
-            expLUT[i] = 0.f;
-    }
-
-    // initialize space-related bilateral filter coefficients
-    for( i = -radius, maxk = 0; i <= radius; i++ )
-        for( j = -radius; j <= radius; j++ )
-        {
-            double r = std::sqrt((double)i*i + (double)j*j);
-            if( r > radius )
-                continue;
-            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
-        }
-
-    // parallel_for usage
-
-    BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
-}
-
-#ifdef HAVE_IPP
-#define IPP_BILATERAL_PARALLEL 1
-
-#ifdef HAVE_IPP_IW
-class ipp_bilateralFilterParallel: public ParallelLoopBody
-{
-public:
-    ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok):
-        src(_src), dst(_dst)
-    {
-        pOk = _ok;
-
-        radius          = _radius;
-        valSquareSigma  = _valSquareSigma;
-        posSquareSigma  = _posSquareSigma;
-        borderType      = _borderType;
-
-        *pOk = true;
-    }
-    ~ipp_bilateralFilterParallel() {}
-
-    virtual void operator() (const Range& range) const CV_OVERRIDE
-    {
-        if(*pOk == false)
-            return;
-
-        try
-        {
-            ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start);
-            CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile);
-        }
-        catch(const ::ipp::IwException &)
-        {
-            *pOk = false;
-            return;
-        }
-    }
-private:
-    ::ipp::IwiImage &src;
-    ::ipp::IwiImage &dst;
-
-    int                  radius;
-    Ipp32f               valSquareSigma;
-    Ipp32f               posSquareSigma;
-    ::ipp::IwiBorderType borderType;
-
-    bool  *pOk;
-    const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&);
-};
-#endif
-
-static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType)
-{
-#ifdef HAVE_IPP_IW
-    CV_INSTRUMENT_REGION_IPP();
-
-    int         radius         = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1);
-    Ipp32f      valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor);
-    Ipp32f      posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace);
-
-    // Acquire data and begin processing
-    try
-    {
-        ::ipp::IwiImage      iwSrc = ippiGetImage(src);
-        ::ipp::IwiImage      iwDst = ippiGetImage(dst);
-        ::ipp::IwiBorderSize borderSize(radius);
-        ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize));
-        if(!ippBorder)
-            return false;
-
-        const int threads = ippiSuggestThreadsNum(iwDst, 2);
-        if(IPP_BILATERAL_PARALLEL && threads > 1) {
-            bool  ok      = true;
-            Range range(0, (int)iwDst.m_size.height);
-            ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok);
-            if(!ok)
-                return false;
-
-            parallel_for_(range, invoker, threads*4);
-
-            if(!ok)
-                return false;
-        } else {
-            CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder);
-        }
-    }
-    catch (const ::ipp::IwException &)
-    {
-        return false;
-    }
-    return true;
-#else
-    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType);
-    return false;
-#endif
-}
-#endif
-
-}
-
-void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
-                      double sigmaColor, double sigmaSpace,
-                      int borderType )
-{
-    CV_INSTRUMENT_REGION();
-
-    _dst.create( _src.size(), _src.type() );
-
-    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
-               ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
-
-    Mat src = _src.getMat(), dst = _dst.getMat();
-
-    CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType));
-
-    if( src.depth() == CV_8U )
-        bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
-    else if( src.depth() == CV_32F )
-        bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
-    else
-        CV_Error( CV_StsUnsupportedFormat,
-        "Bilateral filtering is only implemented for 8u and 32f images" );
-}
-
 //////////////////////////////////////////////////////////////////////////////////////////
 
 CV_IMPL void