From 28fd967148920ead4b55de4df3354c76bedf3cb3 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Fri, 9 Nov 2018 15:27:30 +0300 Subject: [PATCH 1/2] Updated bilateralFilter implementations to use wide universal intrinsics --- .../include/opencv2/core/hal/intrin_avx.hpp | 19 +- modules/imgproc/src/smooth.cpp | 492 ++++++++---------- 2 files changed, 230 insertions(+), 281 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 58b3e7fae7..f8cc7a4d00 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1363,25 +1363,22 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) { - int CV_DECL_ALIGNED(32) idx[8]; - v_store_aligned(idx, idxvec); - return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], - tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); + return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4)); +} + +inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); } inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) { - int CV_DECL_ALIGNED(32) idx[8]; - v_store_aligned(idx, idxvec); - return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], - tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); + return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4)); } inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) { - int CV_DECL_ALIGNED(32) idx[8]; - v_store_aligned(idx, idxvec); - return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); + return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8)); } inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index a4292b6089..e2fe6a58d5 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -2527,10 +2527,6 @@ public: { int i, j, cn = dest->channels(), k; Size size = dest->size(); -#if CV_SIMD128 - int CV_DECL_ALIGNED(16) buf[4]; - bool haveSIMD128 = hasSIMD128(); -#endif for( i = range.start; i < range.end; i++ ) { @@ -2539,154 +2535,153 @@ public: if( cn == 1 ) { - for( j = 0; j < size.width; j++ ) + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); + for( k = 0; k < maxk; k++ ) { - float sum = 0, wsum = 0; - int val0 = sptr[j]; - k = 0; -#if CV_SIMD128 - if( haveSIMD128 ) + const uchar* ksptr = sptr + space_ofs[k]; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) { - v_float32x4 _val0 = v_setall_f32(static_cast(val0)); - v_float32x4 vsumw = v_setzero_f32(); - v_float32x4 vsumc = v_setzero_f32(); - - for( ; k <= maxk - 4; k += 4 ) - { - v_float32x4 _valF = v_float32x4(sptr[j + space_ofs[k]], - sptr[j + space_ofs[k + 1]], - sptr[j + space_ofs[k + 2]], - sptr[j + space_ofs[k + 3]]); - v_float32x4 _val = v_abs(_valF - _val0); - v_store(buf, v_round(_val)); - - v_float32x4 _cw = v_float32x4(color_weight[buf[0]], - color_weight[buf[1]], - color_weight[buf[2]], - color_weight[buf[3]]); - v_float32x4 _sw = v_load(space_weight+k); -#if defined(_MSC_VER) && _MSC_VER == 1700/* MSVS 2012 */ && CV_AVX - // details: https://github.com/opencv/opencv/issues/11004 - vsumw += _cw * _sw; - vsumc += _cw * _sw * _valF; -#else - v_float32x4 _w = _cw * _sw; - _cw = _w * _valF; - - vsumw += _w; - vsumc += _cw; -#endif - } - float *bufFloat = (float*)buf; - v_float32x4 sum4 = v_reduce_sum4(vsumw, vsumc, vsumw, vsumc); - v_store(bufFloat, sum4); - sum += bufFloat[1]; - wsum += bufFloat[0]; + v_uint32 val = vx_load_expand_q(ksptr + j); + v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))); + v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); + v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j))); } #endif - for( ; k < maxk; k++ ) + for (; j < size.width; j++) { - int val = sptr[j + space_ofs[k]]; - float w = space_weight[k]*color_weight[std::abs(val - val0)]; - sum += val*w; - wsum += w; + int val = ksptr[j]; + float w = space_weight[k] * color_weight[std::abs(val - sptr[j])]; + wsum[j] += w; + sum[j] += val * w; } + } + j = 0; +#if CV_SIMD + for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes) + v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )), + v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes)))); +#endif + for (; j < size.width; j++) + { // overflow is not possible here => there is no need to use cv::saturate_cast - CV_DbgAssert(fabs(wsum) > 0); - dptr[j] = (uchar)cvRound(sum/wsum); + CV_DbgAssert(fabs(wsum[j]) > 0); + dptr[j] = (uchar)cvRound(sum[j]/wsum[j]); } } else { assert( cn == 3 ); - for( j = 0; j < size.width*3; j += 3 ) + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); + float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); + float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); + for(k = 0; k < maxk; k++ ) { - float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; - int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - k = 0; -#if CV_SIMD128 - if( haveSIMD128 ) + const uchar* ksptr = sptr + space_ofs[k]; + const uchar* rsptr = sptr; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes) { - v_float32x4 vsumw = v_setzero_f32(); - v_float32x4 vsumb = v_setzero_f32(); - v_float32x4 vsumg = v_setzero_f32(); - v_float32x4 vsumr = v_setzero_f32(); - const v_float32x4 _b0 = v_setall_f32(static_cast(b0)); - const v_float32x4 _g0 = v_setall_f32(static_cast(g0)); - const v_float32x4 _r0 = v_setall_f32(static_cast(r0)); - - for( ; k <= maxk - 4; k += 4 ) - { - const uchar* const sptr_k0 = sptr + j + space_ofs[k]; - const uchar* const sptr_k1 = sptr + j + space_ofs[k+1]; - const uchar* const sptr_k2 = sptr + j + space_ofs[k+2]; - const uchar* const sptr_k3 = sptr + j + space_ofs[k+3]; - - v_float32x4 __b = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k0))); - v_float32x4 __g = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k1))); - v_float32x4 __r = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k2))); - v_float32x4 __z = v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(sptr_k3))); - v_float32x4 _b, _g, _r, _z; - - v_transpose4x4(__b, __g, __r, __z, _b, _g, _r, _z); - - v_float32x4 bt = v_abs(_b -_b0); - v_float32x4 gt = v_abs(_g -_g0); - v_float32x4 rt = v_abs(_r -_r0); - - bt = rt + bt + gt; - v_store(buf, v_round(bt)); - - v_float32x4 _w = v_float32x4(color_weight[buf[0]],color_weight[buf[1]], - color_weight[buf[2]],color_weight[buf[3]]); - v_float32x4 _sw = v_load(space_weight+k); - -#if defined(_MSC_VER) && _MSC_VER == 1700/* MSVS 2012 */ && CV_AVX - // details: https://github.com/opencv/opencv/issues/11004 - vsumw += _w * _sw; - vsumb += _w * _sw * _b; - vsumg += _w * _sw * _g; - vsumr += _w * _sw * _r; -#else - _w *= _sw; - _b *= _w; - _g *= _w; - _r *= _w; - - vsumw += _w; - vsumb += _b; - vsumg += _g; - vsumr += _r; -#endif - } - float *bufFloat = (float*)buf; - v_float32x4 sum4 = v_reduce_sum4(vsumw, vsumb, vsumg, vsumr); - v_store(bufFloat, sum4); - wsum += bufFloat[0]; - sum_b += bufFloat[1]; - sum_g += bufFloat[2]; - sum_r += bufFloat[3]; + v_uint8 kb, kg, kr, rb, rg, rr; + v_load_deinterleave(ksptr, kb, kg, kr); + v_load_deinterleave(rsptr, rb, rg, rr); + + v_uint16 b_l, b_h, g_l, g_h, r_l, r_h; + v_expand(v_absdiff(kb, rb), b_l, b_h); + v_expand(v_absdiff(kg, rg), g_l, g_h); + v_expand(v_absdiff(kr, rr), r_l, r_h); + + v_uint32 val0, val1, val2, val3; + v_expand(b_l + g_l + r_l, val0, val1); + v_expand(b_h + g_h + r_h, val2, val3); + + v_expand(kb, b_l, b_h); + v_expand(kg, g_l, g_h); + v_expand(kr, r_l, r_h); + + v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0)); + v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1)); + v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2)); + v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3)); + v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j)); + v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes)); + v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes)); + v_expand(b_l, val0, val1); + v_expand(b_h, val2, val3); + v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes))); + v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes))); + v_expand(g_l, val0, val1); + v_expand(g_h, val2, val3); + v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes))); + v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes))); + v_expand(r_l, val0, val1); + v_expand(r_h, val2, val3); + v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j))); + v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); + v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); } #endif - - for( ; k < maxk; k++ ) + for(; j < size.width; j++, ksptr += 3, rsptr += 3) { - const uchar* sptr_k = sptr + j + space_ofs[k]; - int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; - float w = space_weight[k]*color_weight[std::abs(b - b0) + - std::abs(g - g0) + std::abs(r - r0)]; - sum_b += b*w; sum_g += g*w; sum_r += r*w; - wsum += w; + int b = ksptr[0], g = ksptr[1], r = ksptr[2]; + float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])]; + wsum[j] += w; + sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; } - CV_DbgAssert(fabs(wsum) > 0); - wsum = 1.f/wsum; - b0 = cvRound(sum_b*wsum); - g0 = cvRound(sum_g*wsum); - r0 = cvRound(sum_r*wsum); - dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0; + } + j = 0; +#if CV_SIMD + v_float32 v_one = vx_setall_f32(1.f); + for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes) + { + v_float32 w0 = v_one / vx_load_aligned(wsum + j); + v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes); + v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes); + v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes); + + v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)), + v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))), + v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)), + v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))), + v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)), + v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))), + v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)), + v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))), + v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)), + v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))), + v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)), + v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes))))); + } +#endif + for(; j < size.width; j++) + { + CV_DbgAssert(fabs(wsum[j]) > 0); + wsum[j] = 1.f/wsum[j]; + *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]); + *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]); + *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]); } } } +#if CV_SIMD + vx_cleanup(); +#endif } private: @@ -2867,10 +2862,6 @@ public: { int i, j, k; Size size = dest->size(); -#if CV_SIMD128 - int CV_DECL_ALIGNED(16) idxBuf[4]; - bool haveSIMD128 = hasSIMD128(); -#endif for( i = range.start; i < range.end; i++ ) { @@ -2879,165 +2870,126 @@ public: if( cn == 1 ) { - for( j = 0; j < size.width; j++ ) + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); +#if CV_SIMD + v_float32 v_one = vx_setall_f32(1.f); + v_float32 sindex = vx_setall_f32(scale_index); +#endif + for( k = 0; k < maxk; k++ ) { - float sum = 0, wsum = 0; - float val0 = sptr[j]; - k = 0; -#if CV_SIMD128 - if( haveSIMD128 ) + const float* ksptr = sptr + space_ofs[k]; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) { - v_float32x4 vecwsum = v_setzero_f32(); - v_float32x4 vecvsum = v_setzero_f32(); - const v_float32x4 _val0 = v_setall_f32(sptr[j]); - const v_float32x4 _scale_index = v_setall_f32(scale_index); - - for (; k <= maxk - 4; k += 4) - { - v_float32x4 _sw = v_load(space_weight + k); - v_float32x4 _val = v_float32x4(sptr[j + space_ofs[k]], - sptr[j + space_ofs[k + 1]], - sptr[j + space_ofs[k + 2]], - sptr[j + space_ofs[k + 3]]); - v_float32x4 _alpha = v_abs(_val - _val0) * _scale_index; - - v_int32x4 _idx = v_round(_alpha); - v_store(idxBuf, _idx); - _alpha -= v_cvt_f32(_idx); - - v_float32x4 _explut = v_float32x4(expLUT[idxBuf[0]], - expLUT[idxBuf[1]], - expLUT[idxBuf[2]], - expLUT[idxBuf[3]]); - v_float32x4 _explut1 = v_float32x4(expLUT[idxBuf[0] + 1], - expLUT[idxBuf[1] + 1], - expLUT[idxBuf[2] + 1], - expLUT[idxBuf[3] + 1]); - - v_float32x4 _w = _sw * (_explut + (_alpha * (_explut1 - _explut))); - _val *= _w; - - vecwsum += _w; - vecvsum += _val; - } - float *bufFloat = (float*)idxBuf; - v_float32x4 sum4 = v_reduce_sum4(vecwsum, vecvsum, vecwsum, vecvsum); - v_store(bufFloat, sum4); - sum += bufFloat[1]; - wsum += bufFloat[0]; + v_float32 val = vx_load(ksptr + j); + + v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex; + v_int32 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + + v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha)); + v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); + v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j))); } #endif - - for( ; k < maxk; k++ ) + for (; j < size.width; j++) { - float val = sptr[j + space_ofs[k]]; - float alpha = (float)(std::abs(val - val0)*scale_index); + float val = ksptr[j]; + float alpha = std::abs(val - sptr[j]) * scale_index; int idx = cvFloor(alpha); alpha -= idx; - float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); - sum += val*w; - wsum += w; + float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); + wsum[j] += w; + sum[j] += val * w; } - CV_DbgAssert(fabs(wsum) > 0); - dptr[j] = (float)(sum/wsum); + } + j = 0; +#if CV_SIMD + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j)); +#endif + for (; j < size.width; j++) + { + CV_DbgAssert(fabs(wsum[j]) > 0); + dptr[j] = sum[j] / wsum[j]; } } else { CV_Assert( cn == 3 ); - for( j = 0; j < size.width*3; j += 3 ) + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); + float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); + float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); +#if CV_SIMD + v_float32 v_one = vx_setall_f32(1.f); + v_float32 sindex = vx_setall_f32(scale_index); +#endif + for (k = 0; k < maxk; k++) { - float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; - float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - k = 0; -#if CV_SIMD128 - if( haveSIMD128 ) + const float* ksptr = sptr + space_ofs[k]; + const float* rsptr = sptr; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes) { - v_float32x4 sumw = v_setzero_f32(); - v_float32x4 sumb = v_setzero_f32(); - v_float32x4 sumg = v_setzero_f32(); - v_float32x4 sumr = v_setzero_f32(); - const v_float32x4 _b0 = v_setall_f32(b0); - const v_float32x4 _g0 = v_setall_f32(g0); - const v_float32x4 _r0 = v_setall_f32(r0); - const v_float32x4 _scale_index = v_setall_f32(scale_index); - - for( ; k <= maxk-4; k += 4 ) - { - v_float32x4 _sw = v_load(space_weight + k); - - const float* const sptr_k0 = sptr + j + space_ofs[k]; - const float* const sptr_k1 = sptr + j + space_ofs[k+1]; - const float* const sptr_k2 = sptr + j + space_ofs[k+2]; - const float* const sptr_k3 = sptr + j + space_ofs[k+3]; - - v_float32x4 _v0 = v_load(sptr_k0); - v_float32x4 _v1 = v_load(sptr_k1); - v_float32x4 _v2 = v_load(sptr_k2); - v_float32x4 _v3 = v_load(sptr_k3); - v_float32x4 _b, _g, _r, _dummy; - - v_transpose4x4(_v0, _v1, _v2, _v3, _b, _g, _r, _dummy); - - v_float32x4 _bt = v_abs(_b - _b0); - v_float32x4 _gt = v_abs(_g - _g0); - v_float32x4 _rt = v_abs(_r - _r0); - v_float32x4 _alpha = _scale_index * (_bt + _gt + _rt); - - v_int32x4 _idx = v_round(_alpha); - v_store((int*)idxBuf, _idx); - _alpha -= v_cvt_f32(_idx); - - v_float32x4 _explut = v_float32x4(expLUT[idxBuf[0]], - expLUT[idxBuf[1]], - expLUT[idxBuf[2]], - expLUT[idxBuf[3]]); - v_float32x4 _explut1 = v_float32x4(expLUT[idxBuf[0] + 1], - expLUT[idxBuf[1] + 1], - expLUT[idxBuf[2] + 1], - expLUT[idxBuf[3] + 1]); - - v_float32x4 _w = _sw * (_explut + (_alpha * (_explut1 - _explut))); - - _b *= _w; - _g *= _w; - _r *= _w; - sumw += _w; - sumb += _b; - sumg += _g; - sumr += _r; - } - v_float32x4 sum4 = v_reduce_sum4(sumw, sumb, sumg, sumr); - float *bufFloat = (float*)idxBuf; - v_store(bufFloat, sum4); - wsum += bufFloat[0]; - sum_b += bufFloat[1]; - sum_g += bufFloat[2]; - sum_r += bufFloat[3]; + v_float32 kb, kg, kr, rb, rg, rr; + v_load_deinterleave(ksptr, kb, kg, kr); + v_load_deinterleave(rsptr, rb, rg, rr); + + v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex; + v_int32 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + + v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha)); + v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); + v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j))); } #endif - - for(; k < maxk; k++ ) + for (; j < size.width; j++, ksptr += 3, rsptr += 3) { - const float* sptr_k = sptr + j + space_ofs[k]; - float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; - float alpha = (float)((std::abs(b - b0) + - std::abs(g - g0) + std::abs(r - r0))*scale_index); + float b = ksptr[0], g = ksptr[1], r = ksptr[2]; + float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index; int idx = cvFloor(alpha); alpha -= idx; - float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); - sum_b += b*w; sum_g += g*w; sum_r += r*w; - wsum += w; + float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])); + wsum[j] += w; + sum_b[j] += b*w; + sum_g[j] += g*w; + sum_r[j] += r*w; } - CV_DbgAssert(fabs(wsum) > 0); - wsum = 1.f/wsum; - b0 = sum_b*wsum; - g0 = sum_g*wsum; - r0 = sum_r*wsum; - dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0; + } + j = 0; +#if CV_SIMD + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes) + { + v_float32 w = v_one / vx_load_aligned(wsum + j); + v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w); + } +#endif + for (; j < size.width; j++) + { + CV_DbgAssert(fabs(wsum[j]) > 0); + wsum[j] = 1.f / wsum[j]; + *(dptr++) = sum_b[j] * wsum[j]; + *(dptr++) = sum_g[j] * wsum[j]; + *(dptr++) = sum_r[j] * wsum[j]; } } } +#if CV_SIMD + vx_cleanup(); +#endif } private: From 2dd98e7cc6cf39c3646ab3e2a0f85d4872773c31 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Fri, 9 Nov 2018 16:17:58 +0300 Subject: [PATCH 2/2] bilateralFilter implementation moved to separate file --- modules/imgproc/src/bilateral_filter.cpp | 759 +++++++++++++++++++++++ modules/imgproc/src/smooth.cpp | 708 --------------------- 2 files changed, 759 insertions(+), 708 deletions(-) create mode 100644 modules/imgproc/src/bilateral_filter.cpp diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.cpp new file mode 100644 index 0000000000..5e39fa4de5 --- /dev/null +++ b/modules/imgproc/src/bilateral_filter.cpp @@ -0,0 +1,759 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include + +#include "opencv2/core/hal/intrin.hpp" +#include "opencl_kernels_imgproc.hpp" + +/****************************************************************************************\ + Bilateral Filtering +\****************************************************************************************/ + +namespace cv +{ + +class BilateralFilter_8u_Invoker : + public ParallelLoopBody +{ +public: + BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk, + int* _space_ofs, float *_space_weight, float *_color_weight) : + temp(&_temp), dest(&_dest), radius(_radius), + maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) + { + } + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + int i, j, cn = dest->channels(), k; + Size size = dest->size(); + + for( i = range.start; i < range.end; i++ ) + { + const uchar* sptr = temp->ptr(i+radius) + radius*cn; + uchar* dptr = dest->ptr(i); + + if( cn == 1 ) + { + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); + for( k = 0; k < maxk; k++ ) + { + const uchar* ksptr = sptr + space_ofs[k]; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + { + v_uint32 val = vx_load_expand_q(ksptr + j); + v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))); + v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); + v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j))); + } +#endif + for (; j < size.width; j++) + { + int val = ksptr[j]; + float w = space_weight[k] * color_weight[std::abs(val - sptr[j])]; + wsum[j] += w; + sum[j] += val * w; + } + } + j = 0; +#if CV_SIMD + for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes) + v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )), + v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes)))); +#endif + for (; j < size.width; j++) + { + // overflow is not possible here => there is no need to use cv::saturate_cast + CV_DbgAssert(fabs(wsum[j]) > 0); + dptr[j] = (uchar)cvRound(sum[j]/wsum[j]); + } + } + else + { + assert( cn == 3 ); + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); + float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); + float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); + for(k = 0; k < maxk; k++ ) + { + const uchar* ksptr = sptr + space_ofs[k]; + const uchar* rsptr = sptr; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes) + { + v_uint8 kb, kg, kr, rb, rg, rr; + v_load_deinterleave(ksptr, kb, kg, kr); + v_load_deinterleave(rsptr, rb, rg, rr); + + v_uint16 b_l, b_h, g_l, g_h, r_l, r_h; + v_expand(v_absdiff(kb, rb), b_l, b_h); + v_expand(v_absdiff(kg, rg), g_l, g_h); + v_expand(v_absdiff(kr, rr), r_l, r_h); + + v_uint32 val0, val1, val2, val3; + v_expand(b_l + g_l + r_l, val0, val1); + v_expand(b_h + g_h + r_h, val2, val3); + + v_expand(kb, b_l, b_h); + v_expand(kg, g_l, g_h); + v_expand(kr, r_l, r_h); + + v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0)); + v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1)); + v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2)); + v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3)); + v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j)); + v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes)); + v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes)); + v_expand(b_l, val0, val1); + v_expand(b_h, val2, val3); + v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes))); + v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes))); + v_expand(g_l, val0, val1); + v_expand(g_h, val2, val3); + v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes))); + v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes))); + v_expand(r_l, val0, val1); + v_expand(r_h, val2, val3); + v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j))); + v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); + v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); + } +#endif + for(; j < size.width; j++, ksptr += 3, rsptr += 3) + { + int b = ksptr[0], g = ksptr[1], r = ksptr[2]; + float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])]; + wsum[j] += w; + sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; + } + } + j = 0; +#if CV_SIMD + v_float32 v_one = vx_setall_f32(1.f); + for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes) + { + v_float32 w0 = v_one / vx_load_aligned(wsum + j); + v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes); + v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes); + v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes); + + v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)), + v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))), + v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)), + v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))), + v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)), + v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))), + v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)), + v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))), + v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)), + v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))), + v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)), + v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes))))); + } +#endif + for(; j < size.width; j++) + { + CV_DbgAssert(fabs(wsum[j]) > 0); + wsum[j] = 1.f/wsum[j]; + *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]); + *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]); + *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]); + } + } + } +#if CV_SIMD + vx_cleanup(); +#endif + } + +private: + const Mat *temp; + Mat *dest; + int radius, maxk, *space_ofs; + float *space_weight, *color_weight; +}; + +#ifdef HAVE_OPENCL + +static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, + double sigma_color, double sigma_space, + int borderType) +{ +#ifdef __ANDROID__ + if (ocl::Device::getDefault().isNVidia()) + return false; +#endif + + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + int i, j, maxk, radius; + + if (depth != CV_8U || cn > 4) + return false; + + if (sigma_color <= 0) + sigma_color = 1; + if (sigma_space <= 0) + sigma_space = 1; + + double gauss_color_coeff = -0.5 / (sigma_color * sigma_color); + double gauss_space_coeff = -0.5 / (sigma_space * sigma_space); + + if ( d <= 0 ) + radius = cvRound(sigma_space * 1.5); + else + radius = d / 2; + radius = MAX(radius, 1); + d = radius * 2 + 1; + + UMat src = _src.getUMat(), dst = _dst.getUMat(), temp; + if (src.u == dst.u) + return false; + + copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); + std::vector _space_weight(d * d); + std::vector _space_ofs(d * d); + float * const space_weight = &_space_weight[0]; + int * const space_ofs = &_space_ofs[0]; + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + for( j = -radius; j <= radius; j++ ) + { + double r = std::sqrt((double)i * i + (double)j * j); + if ( r > radius ) + continue; + space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); + space_ofs[maxk++] = (int)(i * temp.step + j * cn); + } + + char cvt[3][40]; + String cnstr = cn > 1 ? format("%d", cn) : ""; + String kernelName("bilateral"); + size_t sizeDiv = 1; + if ((ocl::Device::getDefault().isIntel()) && + (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) + { + //Intel GPU + if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. + { + kernelName = "bilateral_float4"; + sizeDiv = 4; + } + } + ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, + format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" + " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f", + radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), + ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), + ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), + ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), + ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); + if (k.empty()) + return false; + + Mat mspace_weight(1, d * d, CV_32FC1, space_weight); + Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); + UMat ucolor_weight, uspace_weight, uspace_ofs; + + mspace_weight.copyTo(uspace_weight); + mspace_ofs.copyTo(uspace_ofs); + + k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), + ocl::KernelArg::PtrReadOnly(uspace_weight), + ocl::KernelArg::PtrReadOnly(uspace_ofs)); + + size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; + return k.run(2, globalsize, NULL, false); +} + +#endif +static void +bilateralFilter_8u( const Mat& src, Mat& dst, int d, + double sigma_color, double sigma_space, + int borderType ) +{ + int cn = src.channels(); + int i, j, maxk, radius; + Size size = src.size(); + + CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); + + if( sigma_color <= 0 ) + sigma_color = 1; + if( sigma_space <= 0 ) + sigma_space = 1; + + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + + if( d <= 0 ) + radius = cvRound(sigma_space*1.5); + else + radius = d/2; + radius = MAX(radius, 1); + d = radius*2 + 1; + + Mat temp; + copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); + + std::vector _color_weight(cn*256); + std::vector _space_weight(d*d); + std::vector _space_ofs(d*d); + float* color_weight = &_color_weight[0]; + float* space_weight = &_space_weight[0]; + int* space_ofs = &_space_ofs[0]; + + // initialize color-related bilateral filter coefficients + + for( i = 0; i < 256*cn; i++ ) + color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + { + j = -radius; + + for( ; j <= radius; j++ ) + { + double r = std::sqrt((double)i*i + (double)j*j); + if( r > radius ) + continue; + space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); + space_ofs[maxk++] = (int)(i*temp.step + j*cn); + } + } + + BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); + parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); +} + + +class BilateralFilter_32f_Invoker : + public ParallelLoopBody +{ +public: + + BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs, + const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) : + cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), + temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) + { + } + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + int i, j, k; + Size size = dest->size(); + + for( i = range.start; i < range.end; i++ ) + { + const float* sptr = temp->ptr(i+radius) + radius*cn; + float* dptr = dest->ptr(i); + + if( cn == 1 ) + { + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); +#if CV_SIMD + v_float32 v_one = vx_setall_f32(1.f); + v_float32 sindex = vx_setall_f32(scale_index); +#endif + for( k = 0; k < maxk; k++ ) + { + const float* ksptr = sptr + space_ofs[k]; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + { + v_float32 val = vx_load(ksptr + j); + + v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex; + v_int32 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + + v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha)); + v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); + v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j))); + } +#endif + for (; j < size.width; j++) + { + float val = ksptr[j]; + float alpha = std::abs(val - sptr[j]) * scale_index; + int idx = cvFloor(alpha); + alpha -= idx; + float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); + wsum[j] += w; + sum[j] += val * w; + } + } + j = 0; +#if CV_SIMD + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j)); +#endif + for (; j < size.width; j++) + { + CV_DbgAssert(fabs(wsum[j]) > 0); + dptr[j] = sum[j] / wsum[j]; + } + } + else + { + CV_Assert( cn == 3 ); + AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); + memset(buf.data(), 0, buf.size() * sizeof(float)); + float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); + float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); + float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); + float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); +#if CV_SIMD + v_float32 v_one = vx_setall_f32(1.f); + v_float32 sindex = vx_setall_f32(scale_index); +#endif + for (k = 0; k < maxk; k++) + { + const float* ksptr = sptr + space_ofs[k]; + const float* rsptr = sptr; + j = 0; +#if CV_SIMD + v_float32 kweight = vx_setall_f32(space_weight[k]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes) + { + v_float32 kb, kg, kr, rb, rg, rr; + v_load_deinterleave(ksptr, kb, kg, kr); + v_load_deinterleave(rsptr, rb, rg, rr); + + v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex; + v_int32 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + + v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha)); + v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); + v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j))); + } +#endif + for (; j < size.width; j++, ksptr += 3, rsptr += 3) + { + float b = ksptr[0], g = ksptr[1], r = ksptr[2]; + float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index; + int idx = cvFloor(alpha); + alpha -= idx; + float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])); + wsum[j] += w; + sum_b[j] += b*w; + sum_g[j] += g*w; + sum_r[j] += r*w; + } + } + j = 0; +#if CV_SIMD + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes) + { + v_float32 w = v_one / vx_load_aligned(wsum + j); + v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w); + } +#endif + for (; j < size.width; j++) + { + CV_DbgAssert(fabs(wsum[j]) > 0); + wsum[j] = 1.f / wsum[j]; + *(dptr++) = sum_b[j] * wsum[j]; + *(dptr++) = sum_g[j] * wsum[j]; + *(dptr++) = sum_r[j] * wsum[j]; + } + } + } +#if CV_SIMD + vx_cleanup(); +#endif + } + +private: + int cn, radius, maxk, *space_ofs; + const Mat* temp; + Mat *dest; + float scale_index, *space_weight, *expLUT; +}; + + +static void +bilateralFilter_32f( const Mat& src, Mat& dst, int d, + double sigma_color, double sigma_space, + int borderType ) +{ + int cn = src.channels(); + int i, j, maxk, radius; + double minValSrc=-1, maxValSrc=1; + const int kExpNumBinsPerChannel = 1 << 12; + int kExpNumBins = 0; + float lastExpVal = 1.f; + float len, scale_index; + Size size = src.size(); + + CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data ); + + if( sigma_color <= 0 ) + sigma_color = 1; + if( sigma_space <= 0 ) + sigma_space = 1; + + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + + if( d <= 0 ) + radius = cvRound(sigma_space*1.5); + else + radius = d/2; + radius = MAX(radius, 1); + d = radius*2 + 1; + // compute the min/max range for the input image (even if multichannel) + + minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc ); + if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) + { + src.copyTo(dst); + return; + } + + // temporary copy of the image with borders for easy processing + Mat temp; + copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); + minValSrc -= 5. * sigma_color; + patchNaNs( temp, minValSrc ); // this replacement of NaNs makes the assumption that depth values are nonnegative + // TODO: make replacement parameter avalible in the outside function interface + // allocate lookup tables + std::vector _space_weight(d*d); + std::vector _space_ofs(d*d); + float* space_weight = &_space_weight[0]; + int* space_ofs = &_space_ofs[0]; + + // assign a length which is slightly more than needed + len = (float)(maxValSrc - minValSrc) * cn; + kExpNumBins = kExpNumBinsPerChannel * cn; + std::vector _expLUT(kExpNumBins+2); + float* expLUT = &_expLUT[0]; + + scale_index = kExpNumBins/len; + + // initialize the exp LUT + for( i = 0; i < kExpNumBins+2; i++ ) + { + if( lastExpVal > 0.f ) + { + double val = i / scale_index; + expLUT[i] = (float)std::exp(val * val * gauss_color_coeff); + lastExpVal = expLUT[i]; + } + else + expLUT[i] = 0.f; + } + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + for( j = -radius; j <= radius; j++ ) + { + double r = std::sqrt((double)i*i + (double)j*j); + if( r > radius ) + continue; + space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); + space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); + } + + // parallel_for usage + + BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); + parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); +} + +#ifdef HAVE_IPP +#define IPP_BILATERAL_PARALLEL 1 + +#ifdef HAVE_IPP_IW +class ipp_bilateralFilterParallel: public ParallelLoopBody +{ +public: + ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok): + src(_src), dst(_dst) + { + pOk = _ok; + + radius = _radius; + valSquareSigma = _valSquareSigma; + posSquareSigma = _posSquareSigma; + borderType = _borderType; + + *pOk = true; + } + ~ipp_bilateralFilterParallel() {} + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + if(*pOk == false) + return; + + try + { + ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start); + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile); + } + catch(const ::ipp::IwException &) + { + *pOk = false; + return; + } + } +private: + ::ipp::IwiImage &src; + ::ipp::IwiImage &dst; + + int radius; + Ipp32f valSquareSigma; + Ipp32f posSquareSigma; + ::ipp::IwiBorderType borderType; + + bool *pOk; + const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&); +}; +#endif + +static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + + int radius = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1); + Ipp32f valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor); + Ipp32f posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace); + + // Acquire data and begin processing + try + { + ::ipp::IwiImage iwSrc = ippiGetImage(src); + ::ipp::IwiImage iwDst = ippiGetImage(dst); + ::ipp::IwiBorderSize borderSize(radius); + ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); + if(!ippBorder) + return false; + + const int threads = ippiSuggestThreadsNum(iwDst, 2); + if(IPP_BILATERAL_PARALLEL && threads > 1) { + bool ok = true; + Range range(0, (int)iwDst.m_size.height); + ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok); + if(!ok) + return false; + + parallel_for_(range, invoker, threads*4); + + if(!ok) + return false; + } else { + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder); + } + } + catch (const ::ipp::IwException &) + { + return false; + } + return true; +#else + CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType); + return false; +#endif +} +#endif + +} + +void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, + double sigmaColor, double sigmaSpace, + int borderType ) +{ + CV_INSTRUMENT_REGION(); + + _dst.create( _src.size(), _src.type() ); + + CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), + ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType)) + + Mat src = _src.getMat(), dst = _dst.getMat(); + + CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType)); + + if( src.depth() == CV_8U ) + bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType ); + else if( src.depth() == CV_32F ) + bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType ); + else + CV_Error( CV_StsUnsupportedFormat, + "Bilateral filtering is only implemented for 8u and 32f images" ); +} + +/* End of file. */ diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index e2fe6a58d5..05b2db5b46 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -2505,714 +2505,6 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); } -/****************************************************************************************\ - Bilateral Filtering -\****************************************************************************************/ - -namespace cv -{ - -class BilateralFilter_8u_Invoker : - public ParallelLoopBody -{ -public: - BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk, - int* _space_ofs, float *_space_weight, float *_color_weight) : - temp(&_temp), dest(&_dest), radius(_radius), - maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - int i, j, cn = dest->channels(), k; - Size size = dest->size(); - - for( i = range.start; i < range.end; i++ ) - { - const uchar* sptr = temp->ptr(i+radius) + radius*cn; - uchar* dptr = dest->ptr(i); - - if( cn == 1 ) - { - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); - for( k = 0; k < maxk; k++ ) - { - const uchar* ksptr = sptr + space_ofs[k]; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_uint32 val = vx_load_expand_q(ksptr + j); - v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))); - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j))); - } -#endif - for (; j < size.width; j++) - { - int val = ksptr[j]; - float w = space_weight[k] * color_weight[std::abs(val - sptr[j])]; - wsum[j] += w; - sum[j] += val * w; - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes) - v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )), - v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes)))); -#endif - for (; j < size.width; j++) - { - // overflow is not possible here => there is no need to use cv::saturate_cast - CV_DbgAssert(fabs(wsum[j]) > 0); - dptr[j] = (uchar)cvRound(sum[j]/wsum[j]); - } - } - else - { - assert( cn == 3 ); - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); - float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); - float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); - for(k = 0; k < maxk; k++ ) - { - const uchar* ksptr = sptr + space_ofs[k]; - const uchar* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes) - { - v_uint8 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(ksptr, kb, kg, kr); - v_load_deinterleave(rsptr, rb, rg, rr); - - v_uint16 b_l, b_h, g_l, g_h, r_l, r_h; - v_expand(v_absdiff(kb, rb), b_l, b_h); - v_expand(v_absdiff(kg, rg), g_l, g_h); - v_expand(v_absdiff(kr, rr), r_l, r_h); - - v_uint32 val0, val1, val2, val3; - v_expand(b_l + g_l + r_l, val0, val1); - v_expand(b_h + g_h + r_h, val2, val3); - - v_expand(kb, b_l, b_h); - v_expand(kg, g_l, g_h); - v_expand(kr, r_l, r_h); - - v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0)); - v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1)); - v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2)); - v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3)); - v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes)); - v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes)); - v_expand(b_l, val0, val1); - v_expand(b_h, val2, val3); - v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes))); - v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes))); - v_expand(g_l, val0, val1); - v_expand(g_h, val2, val3); - v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes))); - v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes))); - v_expand(r_l, val0, val1); - v_expand(r_h, val2, val3); - v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); - } -#endif - for(; j < size.width; j++, ksptr += 3, rsptr += 3) - { - int b = ksptr[0], g = ksptr[1], r = ksptr[2]; - float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - } - } - j = 0; -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes) - { - v_float32 w0 = v_one / vx_load_aligned(wsum + j); - v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes); - v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes); - v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes); - - v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)), - v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)), - v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)), - v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes))))); - } -#endif - for(; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) > 0); - wsum[j] = 1.f/wsum[j]; - *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]); - *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]); - *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]); - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -private: - const Mat *temp; - Mat *dest; - int radius, maxk, *space_ofs; - float *space_weight, *color_weight; -}; - -#ifdef HAVE_OPENCL - -static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, - double sigma_color, double sigma_space, - int borderType) -{ -#ifdef __ANDROID__ - if (ocl::Device::getDefault().isNVidia()) - return false; -#endif - - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int i, j, maxk, radius; - - if (depth != CV_8U || cn > 4) - return false; - - if (sigma_color <= 0) - sigma_color = 1; - if (sigma_space <= 0) - sigma_space = 1; - - double gauss_color_coeff = -0.5 / (sigma_color * sigma_color); - double gauss_space_coeff = -0.5 / (sigma_space * sigma_space); - - if ( d <= 0 ) - radius = cvRound(sigma_space * 1.5); - else - radius = d / 2; - radius = MAX(radius, 1); - d = radius * 2 + 1; - - UMat src = _src.getUMat(), dst = _dst.getUMat(), temp; - if (src.u == dst.u) - return false; - - copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); - std::vector _space_weight(d * d); - std::vector _space_ofs(d * d); - float * const space_weight = &_space_weight[0]; - int * const space_ofs = &_space_ofs[0]; - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i * i + (double)j * j); - if ( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); - space_ofs[maxk++] = (int)(i * temp.step + j * cn); - } - - char cvt[3][40]; - String cnstr = cn > 1 ? format("%d", cn) : ""; - String kernelName("bilateral"); - size_t sizeDiv = 1; - if ((ocl::Device::getDefault().isIntel()) && - (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) - { - //Intel GPU - if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. - { - kernelName = "bilateral_float4"; - sizeDiv = 4; - } - } - ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, - format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" - " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f", - radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), - ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), - ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), - ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), - ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); - if (k.empty()) - return false; - - Mat mspace_weight(1, d * d, CV_32FC1, space_weight); - Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); - UMat ucolor_weight, uspace_weight, uspace_ofs; - - mspace_weight.copyTo(uspace_weight); - mspace_ofs.copyTo(uspace_ofs); - - k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(uspace_weight), - ocl::KernelArg::PtrReadOnly(uspace_ofs)); - - size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; - return k.run(2, globalsize, NULL, false); -} - -#endif -static void -bilateralFilter_8u( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) -{ - int cn = src.channels(); - int i, j, maxk, radius; - Size size = src.size(); - - CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 ) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - - std::vector _color_weight(cn*256); - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* color_weight = &_color_weight[0]; - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // initialize color-related bilateral filter coefficients - - for( i = 0; i < 256*cn; i++ ) - color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - { - j = -radius; - - for( ; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*temp.step + j*cn); - } - } - - BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); -} - - -class BilateralFilter_32f_Invoker : - public ParallelLoopBody -{ -public: - - BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs, - const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) : - cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), - temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - int i, j, k; - Size size = dest->size(); - - for( i = range.start; i < range.end; i++ ) - { - const float* sptr = temp->ptr(i+radius) + radius*cn; - float* dptr = dest->ptr(i); - - if( cn == 1 ) - { - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - v_float32 sindex = vx_setall_f32(scale_index); -#endif - for( k = 0; k < maxk; k++ ) - { - const float* ksptr = sptr + space_ofs[k]; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_float32 val = vx_load(ksptr + j); - - v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - - v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha)); - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j))); - } -#endif - for (; j < size.width; j++) - { - float val = ksptr[j]; - float alpha = std::abs(val - sptr[j]) * scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); - wsum[j] += w; - sum[j] += val * w; - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j)); -#endif - for (; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) > 0); - dptr[j] = sum[j] / wsum[j]; - } - } - else - { - CV_Assert( cn == 3 ); - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); - float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); - float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - v_float32 sindex = vx_setall_f32(scale_index); -#endif - for (k = 0; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - const float* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes) - { - v_float32 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(ksptr, kb, kg, kr); - v_load_deinterleave(rsptr, rb, rg, rr); - - v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - - v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha)); - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j))); - } -#endif - for (; j < size.width; j++, ksptr += 3, rsptr += 3) - { - float b = ksptr[0], g = ksptr[1], r = ksptr[2]; - float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes) - { - v_float32 w = v_one / vx_load_aligned(wsum + j); - v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w); - } -#endif - for (; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) > 0); - wsum[j] = 1.f / wsum[j]; - *(dptr++) = sum_b[j] * wsum[j]; - *(dptr++) = sum_g[j] * wsum[j]; - *(dptr++) = sum_r[j] * wsum[j]; - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -private: - int cn, radius, maxk, *space_ofs; - const Mat* temp; - Mat *dest; - float scale_index, *space_weight, *expLUT; -}; - - -static void -bilateralFilter_32f( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) -{ - int cn = src.channels(); - int i, j, maxk, radius; - double minValSrc=-1, maxValSrc=1; - const int kExpNumBinsPerChannel = 1 << 12; - int kExpNumBins = 0; - float lastExpVal = 1.f; - float len, scale_index; - Size size = src.size(); - - CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 ) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - // compute the min/max range for the input image (even if multichannel) - - minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc ); - if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) - { - src.copyTo(dst); - return; - } - - // temporary copy of the image with borders for easy processing - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - const double insteadNaNValue = -5. * sigma_color; - patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative - // TODO: make insteadNaNValue avalible in the outside function interface to control the cases breaking the assumption - // allocate lookup tables - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // assign a length which is slightly more than needed - len = (float)(maxValSrc - minValSrc) * cn; - kExpNumBins = kExpNumBinsPerChannel * cn; - std::vector _expLUT(kExpNumBins+2); - float* expLUT = &_expLUT[0]; - - scale_index = kExpNumBins/len; - - // initialize the exp LUT - for( i = 0; i < kExpNumBins+2; i++ ) - { - if( lastExpVal > 0.f ) - { - double val = i / scale_index; - expLUT[i] = (float)std::exp(val * val * gauss_color_coeff); - lastExpVal = expLUT[i]; - } - else - expLUT[i] = 0.f; - } - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); - } - - // parallel_for usage - - BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); -} - -#ifdef HAVE_IPP -#define IPP_BILATERAL_PARALLEL 1 - -#ifdef HAVE_IPP_IW -class ipp_bilateralFilterParallel: public ParallelLoopBody -{ -public: - ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok): - src(_src), dst(_dst) - { - pOk = _ok; - - radius = _radius; - valSquareSigma = _valSquareSigma; - posSquareSigma = _posSquareSigma; - borderType = _borderType; - - *pOk = true; - } - ~ipp_bilateralFilterParallel() {} - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - if(*pOk == false) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile); - } - catch(const ::ipp::IwException &) - { - *pOk = false; - return; - } - } -private: - ::ipp::IwiImage &src; - ::ipp::IwiImage &dst; - - int radius; - Ipp32f valSquareSigma; - Ipp32f posSquareSigma; - ::ipp::IwiBorderType borderType; - - bool *pOk; - const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&); -}; -#endif - -static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - int radius = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1); - Ipp32f valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor); - Ipp32f posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace); - - // Acquire data and begin processing - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiBorderSize borderSize(radius); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - const int threads = ippiSuggestThreadsNum(iwDst, 2); - if(IPP_BILATERAL_PARALLEL && threads > 1) { - bool ok = true; - Range range(0, (int)iwDst.m_size.height); - ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok); - if(!ok) - return false; - - parallel_for_(range, invoker, threads*4); - - if(!ok) - return false; - } else { - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder); - } - } - catch (const ::ipp::IwException &) - { - return false; - } - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType); - return false; -#endif -} -#endif - -} - -void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, - double sigmaColor, double sigmaSpace, - int borderType ) -{ - CV_INSTRUMENT_REGION(); - - _dst.create( _src.size(), _src.type() ); - - CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), - ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType)) - - Mat src = _src.getMat(), dst = _dst.getMat(); - - CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType)); - - if( src.depth() == CV_8U ) - bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else if( src.depth() == CV_32F ) - bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else - CV_Error( CV_StsUnsupportedFormat, - "Bilateral filtering is only implemented for 8u and 32f images" ); -} - ////////////////////////////////////////////////////////////////////////////////////////// CV_IMPL void