|
|
|
@ -1197,6 +1197,78 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
template <>
// Horizontal 1D smoothing for 16-bit pixels with an odd-length kernel, accumulating
// into 32-bit fixed-point values. The "_yzy_a" variant exploits kernel symmetry:
// the scalar tail multiplies both src[j*cn] and src[(n-1-j)*cn] by the same m[j],
// so the kernel is assumed symmetric (m[j] == m[n-1-j]) — callers must guarantee this.
//
// Parameters:
//   src        - source row (uint16_t samples, cn interleaved channels)
//   cn         - number of channels
//   m          - kernel coefficients in ufixedpoint32 (n entries, symmetric)
//   n          - kernel length (odd; pre_shift = n/2 taps to the left of center)
//   dst        - output row of fixed-point accumulators (len * cn entries)
//   len        - row length in pixels
//   borderType - cv border mode; BORDER_CONSTANT taps outside the row are treated
//                as zero and simply skipped
void hlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const uint16_t* src, int cn, const ufixedpoint32* m, int n, ufixedpoint32* dst, int len, int borderType)
{
    int pre_shift = n / 2;          // taps left of (and including) nothing past the center
    int post_shift = n - pre_shift; // taps from center to the right edge of the kernel
    int i = 0;
    for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border
    {
        // Start the accumulator from the kernel tap that lands on src[0].
        for (int k = 0; k < cn; k++)
            dst[k] = m[pre_shift - i] * src[k];
        if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped
            // Taps that fall at negative source positions: resolve them through
            // borderInterpolate and accumulate.
            for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++)
            {
                int src_idx = borderInterpolate(j, len, borderType);
                for (int k = 0; k < cn; k++)
                    dst[k] = dst[k] + m[mid] * src[src_idx*cn + k];
            }
        int j, mid;
        // In-row taps to the right of src[0].
        for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++)
            for (int k = 0; k < cn; k++)
                dst[k] = dst[k] + m[mid] * src[j*cn + k];
        if (borderType != BORDER_CONSTANT)
            // Taps that run off the right edge (only possible when len < n).
            for (; j < i + post_shift; j++, mid++)
            {
                int src_idx = borderInterpolate(j, len, borderType);
                for (int k = 0; k < cn; k++)
                    dst[k] = dst[k] + m[mid] * src[src_idx*cn + k];
            }
    }
    // Switch from pixel indexing to flat element indexing for the interior part.
    i *= cn;
    int lencn = (len - post_shift + 1)*cn;
#if CV_SIMD
    const int VECSZ = v_uint32::nlanes;
    // Interior pixels: process v_uint16::nlanes (== 2*VECSZ) elements per iteration.
    for (; i <= lencn - VECSZ * 2; i += VECSZ * 2, src += VECSZ * 2, dst += VECSZ * 2)
    {
        v_uint32 v_res0, v_res1;
        // Center tap. The coefficient is reinterpreted as its raw 32-bit value and
        // truncated to 16 bits for the widening u16 multiply — assumes the raw
        // ufixedpoint32 representation of a coefficient < 1.0 fits in the low 16 bits
        // and a little-endian layout; TODO confirm against fixedpoint.inl.
        v_mul_expand(vx_load(src + pre_shift * cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + pre_shift))), v_res0, v_res1);
        for (int j = 0; j < pre_shift; j ++)
        {
            v_uint32 v_add0, v_add1;
            // Symmetric pair of taps: add the mirrored samples first, multiply once.
            v_mul_expand(vx_load(src + j * cn) + vx_load(src + (n - 1 - j)*cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + j))), v_add0, v_add1);
            v_res0 += v_add0;
            v_res1 += v_add1;
        }
        // Store raw 32-bit accumulators straight into the fixed-point dst buffer.
        v_store((uint32_t*)dst, v_res0);
        v_store((uint32_t*)dst + VECSZ, v_res1);
    }
#endif
    // Scalar tail of the interior part (also the whole interior when CV_SIMD is off).
    for (; i < lencn; i++, src++, dst++)
    {
        *dst = m[pre_shift] * src[pre_shift*cn];
        for (int j = 0; j < pre_shift; j++)
            *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n - 1 - j)*cn];
    }
    // Back to pixel indexing for the right-border pixels.
    i /= cn;
    for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border
    {
        for (int k = 0; k < cn; k++)
            dst[k] = m[0] * src[k];
        int j = 1;
        // In-row taps up to the right edge.
        for (; j < len - i; j++)
            for (int k = 0; k < cn; k++)
                dst[k] = dst[k] + m[j] * src[j*cn + k];
        if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped
            // Taps past the right edge, resolved via borderInterpolate
            // (index is computed relative to the current src pointer).
            for (; j < n; j++)
            {
                int src_idx = borderInterpolate(i + j, len, borderType) - i;
                for (int k = 0; k < cn; k++)
                    dst[k] = dst[k] + m[j] * src[src_idx*cn + k];
            }
    }
}
|
|
|
|
template <typename ET, typename FT> |
|
|
|
|
void vlineSmooth1N(const FT* const * src, const FT* m, int, ET* dst, int len) |
|
|
|
|
{ |
|
|
|
@ -1788,6 +1860,62 @@ void vlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const ufixedpoint16* const * s |
|
|
|
|
dst[i] = val; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
template <>
// Vertical 1D smoothing pass producing 16-bit pixels from rows of 32-bit fixed-point
// accumulators. The "_yzy_a" variant relies on a symmetric kernel: the SIMD loop pairs
// row j with row n-1-j under the single coefficient m[j], so m[j] == m[n-1-j] is
// assumed — callers must guarantee this (the scalar tail does not need the symmetry).
//
// Parameters:
//   src - array of n row pointers (ufixedpoint32 values from the horizontal pass)
//   m   - kernel coefficients in ufixedpoint32 (n entries, symmetric)
//   n   - kernel length (odd; pre_shift = n/2)
//   dst - output row of uint16_t pixels
//   len - row length in elements
void vlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src, const ufixedpoint32* m, int n, uint16_t* dst, int len)
{
    int i = 0;
#if CV_SIMD
    int pre_shift = n / 2;
    const int VECSZ = v_uint32::nlanes;
    // Process 2*VECSZ output pixels per iteration; products are widened to 64 bits
    // so the u32 x u32 fixed-point multiplies cannot overflow.
    for (; i <= len - 2*VECSZ; i += 2*VECSZ)
    {
        v_uint32 v_src00, v_src10, v_src01, v_src11;
        v_uint64 v_res0, v_res1, v_res2, v_res3;
        v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7;

        // Center row tap. The coefficient is reinterpreted as its raw 32-bit
        // representation — assumes ufixedpoint32 wraps a single uint32; TODO confirm
        // against fixedpoint.inl.
        v_uint32 v_mul = vx_setall_u32(*((uint32_t*)(m + pre_shift)));
        const uint32_t* srcp = (const uint32_t*)src[pre_shift] + i;
        v_src00 = vx_load(srcp);
        v_src10 = vx_load(srcp + VECSZ);
        v_mul_expand(v_src00, v_mul, v_res0, v_res1);
        v_mul_expand(v_src10, v_mul, v_res2, v_res3);

        int j = 0;
        // Symmetric row pairs (j, n-1-j) share coefficient m[j]; unlike the
        // horizontal pass, the rows are multiplied separately (not pre-added)
        // to keep the products inside 64-bit accumulators.
        for (; j < pre_shift; j++)
        {
            v_mul = vx_setall_u32(*((uint32_t*)(m + j)));

            const uint32_t* srcj0 = (const uint32_t*)src[j] + i;
            const uint32_t* srcj1 = (const uint32_t*)src[n - 1 - j] + i;
            v_src00 = vx_load(srcj0);
            v_src01 = vx_load(srcj1);
            v_mul_expand(v_src00, v_mul, v_tmp0, v_tmp1);
            v_mul_expand(v_src01, v_mul, v_tmp2, v_tmp3);
            v_res0 += v_tmp0 + v_tmp2;
            v_res1 += v_tmp1 + v_tmp3;

            // Second half of the 2*VECSZ stripe.
            v_src10 = vx_load(srcj0 + VECSZ);
            v_src11 = vx_load(srcj1 + VECSZ);
            v_mul_expand(v_src10, v_mul, v_tmp4, v_tmp5);
            v_mul_expand(v_src11, v_mul, v_tmp6, v_tmp7);
            v_res2 += v_tmp4 + v_tmp6;
            v_res3 += v_tmp5 + v_tmp7;
        }

        // Shift the 64-bit accumulators right by 32 (with rounding) and narrow
        // twice: u64 -> u32 -> u16 output pixels.
        v_store(dst + i, v_pack(v_rshr_pack<32>(v_res0, v_res1),
                                v_rshr_pack<32>(v_res2, v_res3)));
    }
#endif
    // Scalar tail: plain fixed-point convolution over the n rows; the
    // ufixedpoint64 -> uint16_t conversion performs the final narrowing.
    for (; i < len; i++)
    {
        ufixedpoint64 val = m[0] * src[0][i];
        for (int j = 1; j < n; j++)
        {
            val = val + m[j] * src[j][i];
        }
        dst[i] = (uint16_t)val;
    }
}
|
|
|
|
template <typename ET, typename FT> |
|
|
|
|
class fixedSmoothInvoker : public ParallelLoopBody |
|
|
|
|
{ |
|
|
|
|