diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 40821e09fe..60003082ea 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1368,6 +1368,25 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NO
 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
 
 // adopted from sse_utils.hpp
+// Loads 32 interleaved bytes and splits them into even-indexed (a) and odd-indexed (b) bytes.
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+
+    __m128i t10 = _mm_unpacklo_epi8(t00, t01);
+    __m128i t11 = _mm_unpackhi_epi8(t00, t01);
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, t11);
+    __m128i t21 = _mm_unpackhi_epi8(t10, t11);
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, t21);
+    __m128i t31 = _mm_unpackhi_epi8(t20, t21);
+
+    a.val = _mm_unpacklo_epi8(t30, t31);
+    b.val = _mm_unpackhi_epi8(t30, t31);
+}
+
 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
 {
     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@@ -1507,6 +1526,16 @@ inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8&
     _mm_storeu_si128((__m128i*)(ptr + 8), t1);
 }
 
+// Stores 32 bytes to ptr with the lanes of a and b interleaved (a0, b0, a1, b1, ...).
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
+{
+    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
+    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
+
+    _mm_storeu_si128((__m128i*)(ptr), v0);
+    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+}
+
 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                 const v_uint8x16& c )
 {
diff --git a/modules/imgproc/perf/opencl/perf_blend.cpp b/modules/imgproc/perf/opencl/perf_blend.cpp
index 6396fef7e2..4feda65144 100644
--- a/modules/imgproc/perf/opencl/perf_blend.cpp
+++ b/modules/imgproc/perf/opencl/perf_blend.cpp
@@ -56,7 +56,7 @@ namespace ocl {
 
 typedef Size_MatType BlendLinearFixture;
 
-OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC3, CV_32FC4, CV_8UC1, CV_8UC3, CV_8UC4)))
 {
     Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
diff --git a/modules/imgproc/src/blend.cpp b/modules/imgproc/src/blend.cpp
index 17e31aa4ab..bcb5c80f4a 100644
--- a/modules/imgproc/src/blend.cpp
+++ b/modules/imgproc/src/blend.cpp
@@ -45,8 +45,275 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 namespace cv {
+#if CV_SIMD128
+// Weighted average (src1*w1 + src2*w2) / (w1 + w2 + eps) on four float lanes.
+static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2)
+{
+    const v_float32x4 v_eps = v_setall_f32(1e-5f);
+    v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+    return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
+}
+// Same blend, loading the four weights from w_ptr1/w_ptr2 at the given offset.
+static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
+{
+    v_float32x4 v_w1 = v_load(w_ptr1 + offset);
+    v_float32x4 v_w2 = v_load(w_ptr2 + offset);
+    return blend(v_src1, v_src2, v_w1, v_w2);
+}
+// Rounds to int32 and clamps each lane to [0, 255].
+static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec)
+{
+    const v_int32x4 z = v_setzero_s32();
+    const v_int32x4 x = v_setall_s32(255);
+    return v_reinterpret_as_u32(v_min(v_max(v_round(vec), z), x));
+}
+// Narrows four float vectors into one vector of 16 clamped bytes.
+static inline v_uint8x16 pack_f32tou8(const v_float32x4& val0, const v_float32x4& val1, const v_float32x4& val2, const v_float32x4& val3)
+{
+    v_uint32x4 a = saturate_f32_u32(val0);
+    v_uint32x4 b = saturate_f32_u32(val1);
+    v_uint32x4 c = saturate_f32_u32(val2);
+    v_uint32x4 d = saturate_f32_u32(val3);
+    v_uint16x8 e = v_pack(a, b);
+    v_uint16x8 f = v_pack(c, d);
+    return v_pack(e, f);
+}
+// Packs four float vectors to 16 bytes and stores them at ptr.
+static inline void store_pack_f32tou8(uchar* ptr, const v_float32x4& val0, const v_float32x4& val1, const v_float32x4& val2, const v_float32x4& val3)
+{
+    v_store((ptr), pack_f32tou8(val0, val1, val2, val3));
+}
+// Widens 16 bytes into four float vectors of four lanes each.
+static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+{
+    v_uint16x8 a0, a1;
+    v_expand(src, a0, a1);
+    v_uint32x4 b0, b1,b2,b3;
+    v_expand(a0, b0, b1);
+    v_expand(a1, b2, b3);
+    dst0 = v_cvt_f32(v_reinterpret_as_s32(b0));
+    dst1 = v_cvt_f32(v_reinterpret_as_s32(b1));
+    dst2 = v_cvt_f32(v_reinterpret_as_s32(b2));
+    dst3 = v_cvt_f32(v_reinterpret_as_s32(b3));
+}
+// Loads 16 bytes from ptr and widens them to four float vectors.
+static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+{
+    v_uint8x16 a = v_load((ptr));
+    expand_u8tof32(a, dst0, dst1, dst2, dst3);
+}
+// Blends one row of interleaved 8-bit pixels using per-pixel float weights.
+// x and width count channel elements; the weights are indexed per pixel starting
+// at 0, so the caller is expected to pass x == 0. Returns the index of the first
+// element left for the scalar tail loop.
+int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
+{
+    const v_float32x4 v_eps = v_setall_f32(1e-5f);
+    int weight_offset = 0;
+    int step = v_uint8x16::nlanes * cn;
+    // weights advance by one float per pixel, not per channel element
+    int weight_step = v_uint8x16::nlanes;
+    switch(cn)
+    {
+    case 1:
+        for( ; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src13;
+            v_float32x4 v_src20, v_src21, v_src22, v_src23;
+            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
+            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
+
+            v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
+            v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4);
+            v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8);
+            v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12);
+
+            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+        }
+        break;
+    case 2:
+        for( ; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_uint8x16 v_src10, v_src11, v_src20, v_src21;
+            v_load_deinterleave(src1 + x, v_src10, v_src11);
+            v_load_deinterleave(src2 + x, v_src20, v_src21);
+            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
+            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
+            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
+            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
+            expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
+            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
+
+            v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
+            v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
+            v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4);
+            v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4);
+            v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8);
+            v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8);
+            v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12);
+            v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12);
+
+            v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
+            v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
+            v_store_interleave(dst + x, v_dsta, v_dstb);
+        }
+        break;
+    case 3:
+        for( ; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
+            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
+
+            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
+            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
+            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
+            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
+            expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123);
+            expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
+            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
+            expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);
+
+            v_float32x4 v_w10 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4);
+            v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8);
+            v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12);
+            v_float32x4 v_w20 = v_load(weights2 + weight_offset);
+            v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4);
+            v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8);
+            v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12);
+            v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
+            v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
+            v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
+            v_src101 = blend(v_src101, v_src201, v_w11, v_w21);
+            v_src111 = blend(v_src111, v_src211, v_w11, v_w21);
+            v_src121 = blend(v_src121, v_src221, v_w11, v_w21);
+            v_src102 = blend(v_src102, v_src202, v_w12, v_w22);
+            v_src112 = blend(v_src112, v_src212, v_w12, v_w22);
+            v_src122 = blend(v_src122, v_src222, v_w12, v_w22);
+            v_src103 = blend(v_src103, v_src203, v_w13, v_w23);
+            v_src113 = blend(v_src113, v_src213, v_w13, v_w23);
+            v_src123 = blend(v_src123, v_src223, v_w13, v_w23);
+
+
+            v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
+            v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
+            v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
+
+            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
+        }
+        break;
+    case 4:
+        // 16 bytes hold four 4-channel pixels, so only four weights are consumed
+        step = v_uint8x16::nlanes;
+        weight_step = v_float32x4::nlanes;
+        for( ; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
+            v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
+            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
+            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
+            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
+            v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
+
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+            v_src10 = (v_src14 * v_w1 + v_src24 * v_w2) / v_denom;
+            v_src11 = (v_src15 * v_w1 + v_src25 * v_w2) / v_denom;
+            v_src12 = (v_src16 * v_w1 + v_src26 * v_w2) / v_denom;
+            v_src13 = (v_src17 * v_w1 + v_src27 * v_w2) / v_denom;
+            v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
+            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
+
+            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+        }
+        break;
+    default:
+        break;
+    }
+    return x;
+}
+
+// Blends one row of interleaved float pixels; same contract as the 8-bit overload.
+// Every iteration handles four pixels, hence four weights.
+int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
+{
+    const v_float32x4 v_eps = v_setall_f32(1e-5f);
+    int weight_offset = 0;
+    int step = v_float32x4::nlanes*cn;
+    switch(cn)
+    {
+    case 1:
+        for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src1 = v_load(src1 + x);
+            v_float32x4 v_src2 = v_load(src2 + x);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+            v_float32x4 v_dst = (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
+
+            v_store(dst + x, v_dst);
+        }
+        break;
+    case 2:
+        for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src10, v_src11, v_src20, v_src21;
+            v_load_deinterleave(src1 + x, v_src10, v_src11);
+            v_load_deinterleave(src2 + x, v_src20, v_src21);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+            v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
+            v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
+
+            v_store_interleave(dst + x, v_dst0, v_dst1);
+        }
+        break;
+    case 3:
+        for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
+            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+            v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
+            v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
+            v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom;
+
+            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
+        }
+        break;
+    case 4:
+        for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
+            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
+            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+            v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
+            v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
+            v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom;
+            v_float32x4 v_dst3 = (v_src13 * v_w1 + v_src23 * v_w2) / v_denom;
+
+            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+        }
+        break;
+    default:
+        break;
+    }
+    return x;
+}
+#endif
 
 template <typename T>
 class BlendLinearInvoker :
@@ -71,7 +338,12 @@ public:
             const T * const src2_row = src2->ptr<T>(y);
             T * const dst_row = dst->ptr<T>(y);
 
-            for (int x = 0; x < width; ++x)
+            int x = 0;
+            #if CV_SIMD128
+            x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
+            #endif
+
+            for ( ; x < width; ++x)
             {
                 int x1 = x / cn;
                 float w1 = weights1_row[x1], w2 = weights2_row[x1];