@@ -45,6 +45,7 @@
#include "opencl_kernels_imgproc.hpp"
#include <iostream>
#include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <opencv2/core/utils/configuration.private.hpp>

/****************************************************************************************\
@@ -97,73 +98,65 @@ struct MorphNoVec
    int operator()(uchar**, int, uchar*, int) const { return 0; }
};

#if CV_SSE2
#if CV_SIMD

template<class VecUpdate> struct MorphRowIVec
template<class VecUpdate> struct MorphRowVec
{
    enum { ESZ = VecUpdate::ESZ };

    MorphRowIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
    typedef typename VecUpdate::vtype vtype;
    typedef typename vtype::lane_type stype;
    MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
    int operator()(const uchar* src, uchar* dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        cn *= ESZ;
        int i, k, _ksize = ksize*cn;
        width = (width & -4)*cn;
        width *= cn;
        VecUpdate updateOp;

        for( i = 0; i <= width - 16; i += 16 )
        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
        {
            __m128i s = _mm_loadu_si128((const __m128i*)(src + i));
            for( k = cn; k < _ksize; k += cn )
            vtype s0 = vx_load((const stype*)src + i);
            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
            for (k = cn; k < _ksize; k += cn)
            {
                __m128i x = _mm_loadu_si128((const __m128i*)(src + i + k));
                s = updateOp(s, x);
                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
            }
            _mm_storeu_si128((__m128i*)(dst + i), s);
            v_store((stype*)dst + i, s0);
            v_store((stype*)dst + i + vtype::nlanes, s1);
            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
        }

        for( ; i < width; i += 4 )
        if( i <= width - 2*vtype::nlanes )
        {
            __m128i s = _mm_cvtsi32_si128(*(const int*)(src + i));
            vtype s0 = vx_load((const stype*)src + i);
            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
            for( k = cn; k < _ksize; k += cn )
            {
                __m128i x = _mm_cvtsi32_si128(*(const int*)(src + i + k));
                s = updateOp(s, x);
                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
            }
            *(int*)(dst + i) = _mm_cvtsi128_si32(s);
            v_store((stype*)dst + i, s0);
            v_store((stype*)dst + i + vtype::nlanes, s1);
            i += 2*vtype::nlanes;
        }

        return i/ESZ;
    }

    int ksize, anchor;
};

template<class VecUpdate> struct MorphRowFVec
{
    MorphRowFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
    int operator()(const uchar* src, uchar* dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int i, k, _ksize = ksize*cn;
        width = (width & -4)*cn;
        VecUpdate updateOp;

        for( i = 0; i < width; i += 4 )
        if( i <= width - vtype::nlanes )
        {
            __m128 s = _mm_loadu_ps((const float*)src + i);
            vtype s = vx_load((const stype*)src + i);
            for( k = cn; k < _ksize; k += cn )
            {
                __m128 x = _mm_loadu_ps((const float*)src + i + k);
                s = updateOp(s, x);
            }
            _mm_storeu_ps((float*)dst + i, s);
                s = updateOp(s, vx_load((const stype*)src + i + k));
            v_store((stype*)dst + i, s);
            i += vtype::nlanes;
        }
        if( i <= width - vtype::nlanes/2 )
        {
            vtype s = vx_load_low((const stype*)src + i);
            for( k = cn; k < _ksize; k += cn )
                s = updateOp(s, vx_load_low((const stype*)src + i + k));
            v_store_low((stype*)dst + i, s);
            i += vtype::nlanes/2;
        }

        return i;
@@ -173,230 +166,156 @@ template<class VecUpdate> struct MorphRowFVec
};

template<class VecUpdate> struct MorphColumnIVec
template<class VecUpdate> struct MorphColumnVec
{
    enum { ESZ = VecUpdate::ESZ };

    MorphColumnIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
    int operator()(const uchar** src, uchar* dst, int dststep, int count, int width) const
    typedef typename VecUpdate::vtype vtype;
    typedef typename vtype::lane_type stype;
    MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, k, _ksize = ksize;
        width *= ESZ;
        VecUpdate updateOp;

        for( i = 0; i < count + ksize - 1; i++ )
            CV_Assert( ((size_t)src[i] & 15) == 0 );
            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );

        const stype** src = (const stype**)_src;
        stype* dst = (stype*)_dst;
        dststep /= sizeof(dst[0]);

        for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
        {
            for( i = 0; i <= width - 32; i += 32 )
            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
            {
                const uchar* sptr = src[1] + i;
                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
                __m128i x0, x1;
                const stype* sptr = src[1] + i;
                vtype s0 = vx_load_aligned(sptr);
                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);

                for( k = 2; k < _ksize; k++ )
                {
                    sptr = src[k] + i;
                    x0 = _mm_load_si128((const __m128i*)sptr);
                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                    s0 = updateOp(s0, vx_load_aligned(sptr));
                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                }

                sptr = src[0] + i;
                x0 = _mm_load_si128((const __m128i*)sptr);
                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
                _mm_storeu_si128((__m128i*)(dst + i), updateOp(s0, x0));
                _mm_storeu_si128((__m128i*)(dst + i + 16), updateOp(s1, x1));
                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));

                sptr = src[k] + i;
                x0 = _mm_load_si128((const __m128i*)sptr);
                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
                _mm_storeu_si128((__m128i*)(dst + dststep + i), updateOp(s0, x0));
                _mm_storeu_si128((__m128i*)(dst + dststep + i + 16), updateOp(s1, x1));
                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
            }

            for( ; i <= width - 8; i += 8 )
            if( i <= width - 2*vtype::nlanes )
            {
                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[1] + i)), x0;
                const stype* sptr = src[1] + i;
                vtype s0 = vx_load_aligned(sptr);
                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);

                for( k = 2; k < _ksize; k++ )
                {
                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
                    s0 = updateOp(s0, x0);
                }

                x0 = _mm_loadl_epi64((const __m128i*)(src[0] + i));
                _mm_storel_epi64((__m128i*)(dst + i), updateOp(s0, x0));
                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
                _mm_storel_epi64((__m128i*)(dst + dststep + i), updateOp(s0, x0));
            }
        }

        for( ; count > 0; count--, dst += dststep, src++ )
        {
            for( i = 0; i <= width - 32; i += 32 )
            {
                const uchar* sptr = src[0] + i;
                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
                __m128i x0, x1;

                for( k = 1; k < _ksize; k++ )
                {
                    sptr = src[k] + i;
                    x0 = _mm_load_si128((const __m128i*)sptr);
                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                    s0 = updateOp(s0, vx_load_aligned(sptr));
                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                }
                _mm_storeu_si128((__m128i*)(dst + i), s0);
                _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
            }

            for( ; i <= width - 8; i += 8 )
            {
                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;
                sptr = src[0] + i;
                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));

                for( k = 1; k < _ksize; k++ )
                {
                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
                    s0 = updateOp(s0, x0);
                }
                _mm_storel_epi64((__m128i*)(dst + i), s0);
                sptr = src[k] + i;
                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
                i += 2*vtype::nlanes;
            }
        }

        return i/ESZ;
    }

    int ksize, anchor;
};

template<class VecUpdate> struct MorphColumnFVec
{
    MorphColumnFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int i = 0, k, _ksize = ksize;
        VecUpdate updateOp;

        for( i = 0; i < count + ksize - 1; i++ )
            CV_Assert( ((size_t)_src[i] & 15) == 0 );

        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        dststep /= sizeof(dst[0]);

        for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
        {
            for( i = 0; i <= width - 16; i += 16 )
            if( i <= width - vtype::nlanes )
            {
                const float* sptr = src[1] + i;
                __m128 s0 = _mm_load_ps(sptr);
                __m128 s1 = _mm_load_ps(sptr + 4);
                __m128 s2 = _mm_load_ps(sptr + 8);
                __m128 s3 = _mm_load_ps(sptr + 12);
                __m128 x0, x1, x2, x3;
                vtype s0 = vx_load_aligned(src[1] + i);

                for( k = 2; k < _ksize; k++ )
                {
                    sptr = src[k] + i;
                    x0 = _mm_load_ps(sptr);
                    x1 = _mm_load_ps(sptr + 4);
                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                    x2 = _mm_load_ps(sptr + 8);
                    x3 = _mm_load_ps(sptr + 12);
                    s2 = updateOp(s2, x2);
                    s3 = updateOp(s3, x3);
                }

                sptr = src[0] + i;
                x0 = _mm_load_ps(sptr);
                x1 = _mm_load_ps(sptr + 4);
                x2 = _mm_load_ps(sptr + 8);
                x3 = _mm_load_ps(sptr + 12);
                _mm_storeu_ps(dst + i, updateOp(s0, x0));
                _mm_storeu_ps(dst + i + 4, updateOp(s1, x1));
                _mm_storeu_ps(dst + i + 8, updateOp(s2, x2));
                _mm_storeu_ps(dst + i + 12, updateOp(s3, x3));
                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));

                sptr = src[k] + i;
                x0 = _mm_load_ps(sptr);
                x1 = _mm_load_ps(sptr + 4);
                x2 = _mm_load_ps(sptr + 8);
                x3 = _mm_load_ps(sptr + 12);
                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
                _mm_storeu_ps(dst + dststep + i + 4, updateOp(s1, x1));
                _mm_storeu_ps(dst + dststep + i + 8, updateOp(s2, x2));
                _mm_storeu_ps(dst + dststep + i + 12, updateOp(s3, x3));
                v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
                i += vtype::nlanes;
            }

            for( ; i <= width - 4; i += 4 )
            if( i <= width - vtype::nlanes/2 )
            {
                __m128 s0 = _mm_load_ps(src[1] + i), x0;
                vtype s0 = vx_load_low(src[1] + i);

                for( k = 2; k < _ksize; k++ )
                {
                    x0 = _mm_load_ps(src[k] + i);
                    s0 = updateOp(s0, x0);
                }
                    s0 = updateOp(s0, vx_load_low(src[k] + i));

                x0 = _mm_load_ps(src[0] + i);
                _mm_storeu_ps(dst + i, updateOp(s0, x0));
                x0 = _mm_load_ps(src[k] + i);
                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
                v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
                v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
                i += vtype::nlanes/2;
            }
        }

        for( ; count > 0; count--, dst += dststep, src++ )
        {
            for( i = 0; i <= width - 16; i += 16 )
            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
            {
                const float* sptr = src[0] + i;
                __m128 s0 = _mm_load_ps(sptr);
                __m128 s1 = _mm_load_ps(sptr + 4);
                __m128 s2 = _mm_load_ps(sptr + 8);
                __m128 s3 = _mm_load_ps(sptr + 12);
                __m128 x0, x1, x2, x3;
                const stype* sptr = src[0] + i;
                vtype s0 = vx_load_aligned(sptr);
                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);

                for( k = 1; k < _ksize; k++ )
                {
                    sptr = src[k] + i;
                    x0 = _mm_load_ps(sptr);
                    x1 = _mm_load_ps(sptr + 4);
                    s0 = updateOp(s0, x0);
                    s1 = updateOp(s1, x1);
                    x2 = _mm_load_ps(sptr + 8);
                    x3 = _mm_load_ps(sptr + 12);
                    s2 = updateOp(s2, x2);
                    s3 = updateOp(s3, x3);
                    s0 = updateOp(s0, vx_load_aligned(sptr));
                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                }
                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
                v_store(dst + i, s0);
                v_store(dst + i + vtype::nlanes, s1);
                v_store(dst + i + 2*vtype::nlanes, s2);
                v_store(dst + i + 3*vtype::nlanes, s3);
            }

            for( i = 0; i <= width - 4; i += 4 )
            if( i <= width - 2*vtype::nlanes )
            {
                __m128 s0 = _mm_load_ps(src[0] + i), x0;
                const stype* sptr = src[0] + i;
                vtype s0 = vx_load_aligned(sptr);
                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);

                for( k = 1; k < _ksize; k++ )
                {
                    x0 = _mm_load_ps(src[k] + i);
                    s0 = updateOp(s0, x0);
                    sptr = src[k] + i;
                    s0 = updateOp(s0, vx_load_aligned(sptr));
                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                }
                _mm_storeu_ps(dst + i, s0);
                v_store(dst + i, s0);
                v_store(dst + i + vtype::nlanes, s1);
                i += 2*vtype::nlanes;
            }
            if( i <= width - vtype::nlanes )
            {
                vtype s0 = vx_load_aligned(src[0] + i);

                for( k = 1; k < _ksize; k++ )
                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));
                v_store(dst + i, s0);
                i += vtype::nlanes;
            }
            if( i <= width - vtype::nlanes/2 )
            {
                vtype s0 = vx_load_low(src[0] + i);

                for( k = 1; k < _ksize; k++ )
                    s0 = updateOp(s0, vx_load_low(src[k] + i));
                v_store_low(dst + i, s0);
                i += vtype::nlanes/2;
            }
        }
@@ -407,185 +326,109 @@ template<class VecUpdate> struct MorphColumnFVec
};

template<class VecUpdate> struct MorphIVec
template<class VecUpdate> struct MorphVec
{
    enum { ESZ = VecUpdate::ESZ };

    int operator()(uchar** src, int nz, uchar* dst, int width) const
    typedef typename VecUpdate::vtype vtype;
    typedef typename vtype::lane_type stype;
    int operator()(uchar** _src, int nz, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const stype** src = (const stype**)_src;
        stype* dst = (stype*)_dst;
        int i, k;
        width *= ESZ;
        VecUpdate updateOp;

        for( i = 0; i <= width - 32; i += 32 )
        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
        {
            const uchar* sptr = src[0] + i;
            __m128i s0 = _mm_loadu_si128((const __m128i*)sptr);
            __m128i s1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
            __m128i x0, x1;

            const stype* sptr = src[0] + i;
            vtype s0 = vx_load(sptr);
            vtype s1 = vx_load(sptr + vtype::nlanes);
            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
            for( k = 1; k < nz; k++ )
            {
                sptr = src[k] + i;
                x0 = _mm_loadu_si128((const __m128i*)sptr);
                x1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
            }
            _mm_storeu_si128((__m128i*)(dst + i), s0);
            _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
        }

        for( ; i <= width - 8; i += 8 )
        {
            __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;

            for( k = 1; k < nz; k++ )
            {
                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
                s0 = updateOp(s0, x0);
                s0 = updateOp(s0, vx_load(sptr));
                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
            }
            _mm_storel_epi64((__m128i*)(dst + i), s0);
            v_store(dst + i, s0);
            v_store(dst + i + vtype::nlanes, s1);
            v_store(dst + i + 2*vtype::nlanes, s2);
            v_store(dst + i + 3*vtype::nlanes, s3);
        }

        return i/ESZ;
    }
};

template<class VecUpdate> struct MorphFVec
{
    int operator()(uchar** _src, int nz, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        int i, k;
        VecUpdate updateOp;

        for( i = 0; i <= width - 16; i += 16 )
        if( i <= width - 2*vtype::nlanes )
        {
            const float* sptr = src[0] + i;
            __m128 s0 = _mm_loadu_ps(sptr);
            __m128 s1 = _mm_loadu_ps(sptr + 4);
            __m128 s2 = _mm_loadu_ps(sptr + 8);
            __m128 s3 = _mm_loadu_ps(sptr + 12);
            __m128 x0, x1, x2, x3;

            const stype* sptr = src[0] + i;
            vtype s0 = vx_load(sptr);
            vtype s1 = vx_load(sptr + vtype::nlanes);
            for( k = 1; k < nz; k++ )
            {
                sptr = src[k] + i;
                x0 = _mm_loadu_ps(sptr);
                x1 = _mm_loadu_ps(sptr + 4);
                x2 = _mm_loadu_ps(sptr + 8);
                x3 = _mm_loadu_ps(sptr + 12);
                s0 = updateOp(s0, x0);
                s1 = updateOp(s1, x1);
                s2 = updateOp(s2, x2);
                s3 = updateOp(s3, x3);
                s0 = updateOp(s0, vx_load(sptr));
                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
            }
            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
            v_store(dst + i, s0);
            v_store(dst + i + vtype::nlanes, s1);
            i += 2*vtype::nlanes;
        }

        for( ; i <= width - 4; i += 4 )
        if( i <= width - vtype::nlanes )
        {
            __m128 s0 = _mm_loadu_ps(src[0] + i), x0;

            vtype s0 = vx_load(src[0] + i);
            for( k = 1; k < nz; k++ )
            {
                x0 = _mm_loadu_ps(src[k] + i);
                s0 = updateOp(s0, x0);
            }
            _mm_storeu_ps(dst + i, s0);
                s0 = updateOp(s0, vx_load(src[k] + i));
            v_store(dst + i, s0);
            i += vtype::nlanes;
        }

        for( ; i < width; i++ )
        if( i <= width - vtype::nlanes/2 )
        {
            __m128 s0 = _mm_load_ss(src[0] + i), x0;

            vtype s0 = vx_load_low(src[0] + i);
            for( k = 1; k < nz; k++ )
            {
                x0 = _mm_load_ss(src[k] + i);
                s0 = updateOp(s0, x0);
            }
            _mm_store_ss(dst + i, s0);
                s0 = updateOp(s0, vx_load_low(src[k] + i));
            v_store_low(dst + i, s0);
            i += vtype::nlanes/2;
        }

        return i;
    }
};

struct VMin8u
{
    enum { ESZ = 1 };
    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }
};
struct VMax8u
{
    enum { ESZ = 1 };
    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); }
};
struct VMin16u
{
    enum { ESZ = 2 };
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); }
};
struct VMax16u
{
    enum { ESZ = 2 };
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_adds_epu16(_mm_subs_epu16(a,b), b); }
};
struct VMin16s
template <typename T> struct VMin
{
    enum { ESZ = 2 };
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_min_epi16(a, b); }
    typedef T vtype;
    vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); }
};
struct VMax16s
template <typename T> struct VMax
{
    enum { ESZ = 2 };
    __m128i operator()(const __m128i& a, const __m128i& b) const
    { return _mm_max_epi16(a, b); }
    typedef T vtype;
    vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); }
};
struct VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }};
struct VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }};

typedef MorphRowIVec<VMin8u> ErodeRowVec8u;
typedef MorphRowIVec<VMax8u> DilateRowVec8u;
typedef MorphRowIVec<VMin16u> ErodeRowVec16u;
typedef MorphRowIVec<VMax16u> DilateRowVec16u;
typedef MorphRowIVec<VMin16s> ErodeRowVec16s;
typedef MorphRowIVec<VMax16s> DilateRowVec16s;
typedef MorphRowFVec<VMin32f> ErodeRowVec32f;
typedef MorphRowFVec<VMax32f> DilateRowVec32f;

typedef MorphColumnIVec<VMin8u> ErodeColumnVec8u;
typedef MorphColumnIVec<VMax8u> DilateColumnVec8u;
typedef MorphColumnIVec<VMin16u> ErodeColumnVec16u;
typedef MorphColumnIVec<VMax16u> DilateColumnVec16u;
typedef MorphColumnIVec<VMin16s> ErodeColumnVec16s;
typedef MorphColumnIVec<VMax16s> DilateColumnVec16s;
typedef MorphColumnFVec<VMin32f> ErodeColumnVec32f;
typedef MorphColumnFVec<VMax32f> DilateColumnVec32f;

typedef MorphIVec<VMin8u> ErodeVec8u;
typedef MorphIVec<VMax8u> DilateVec8u;
typedef MorphIVec<VMin16u> ErodeVec16u;
typedef MorphIVec<VMax16u> DilateVec16u;
typedef MorphIVec<VMin16s> ErodeVec16s;
typedef MorphIVec<VMax16s> DilateVec16s;
typedef MorphFVec<VMin32f> ErodeVec32f;
typedef MorphFVec<VMax32f> DilateVec32f;

typedef MorphRowVec<VMin<v_uint8> > ErodeRowVec8u;
typedef MorphRowVec<VMax<v_uint8> > DilateRowVec8u;
typedef MorphRowVec<VMin<v_uint16> > ErodeRowVec16u;
typedef MorphRowVec<VMax<v_uint16> > DilateRowVec16u;
typedef MorphRowVec<VMin<v_int16> > ErodeRowVec16s;
typedef MorphRowVec<VMax<v_int16> > DilateRowVec16s;
typedef MorphRowVec<VMin<v_float32> > ErodeRowVec32f;
typedef MorphRowVec<VMax<v_float32> > DilateRowVec32f;

typedef MorphColumnVec<VMin<v_uint8> > ErodeColumnVec8u;
typedef MorphColumnVec<VMax<v_uint8> > DilateColumnVec8u;
typedef MorphColumnVec<VMin<v_uint16> > ErodeColumnVec16u;
typedef MorphColumnVec<VMax<v_uint16> > DilateColumnVec16u;
typedef MorphColumnVec<VMin<v_int16> > ErodeColumnVec16s;
typedef MorphColumnVec<VMax<v_int16> > DilateColumnVec16s;
typedef MorphColumnVec<VMin<v_float32> > ErodeColumnVec32f;
typedef MorphColumnVec<VMax<v_float32> > DilateColumnVec32f;

typedef MorphVec<VMin<v_uint8> > ErodeVec8u;
typedef MorphVec<VMax<v_uint8> > DilateVec8u;
typedef MorphVec<VMin<v_uint16> > ErodeVec16u;
typedef MorphVec<VMax<v_uint16> > DilateVec16u;
typedef MorphVec<VMin<v_int16> > ErodeVec16s;
typedef MorphVec<VMax<v_int16> > DilateVec16s;
typedef MorphVec<VMin<v_float32> > ErodeVec32f;
typedef MorphVec<VMax<v_float32> > DilateVec32f;

#else