From 0ba3b06efda3ce30d625799759a48f305cd570c9 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:02:57 +0200
Subject: [PATCH 01/20] SymmRowSmallVec_8u32s [1, 2, 1] NEON speedup: 4.1
 Auto-vect speedup: 2.25

---
 modules/imgproc/src/filter.cpp | 123 +++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 05db957498..9750e51544 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2207,6 +2207,129 @@ struct FilterVec_32f
 };
 
 
+#elif CV_NEON
+
+struct SymmRowSmallVec_8u32s
+{
+    SymmRowSmallVec_8u32s() { smallValues = false; }
+    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
+    {
+        kernel = _kernel;
+        symmetryType = _symmetryType;
+        smallValues = true;
+        int k, ksize = kernel.rows + kernel.cols - 1;
+        for( k = 0; k < ksize; k++ )
+        {
+            int v = kernel.ptr<int>()[k];
+            if( v < SHRT_MIN || v > SHRT_MAX )
+            {
+                smallValues = false;
+                break;
+            }
+        }
+    }
+
+    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
+    {
+        //Uncomment the two following lines when runtime support for neon is implemented.
+        // if( !checkHardwareSupport(CV_CPU_NEON) )
+        //     return 0;
+
+        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
+        int* dst = (int*)_dst;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        const int* kx = kernel.ptr<int>() + _ksize/2;
+        if( !smallValues )
+            return 0;
+
+        src += (_ksize/2)*cn;
+        width *= cn;
+
+        if( symmetrical )
+        {
+            if( _ksize == 1 )
+                return 0;
+            if( _ksize == 3 )
+            {
+                if( kx[0] == 2 && kx[1] == 1 )
+                {
+                    uint16x8_t zq = vdupq_n_u16(0);
+
+                    for( ; i <= width - 8; i += 8, src += 8 )
+                    {
+                        uint8x8_t x0, x1, x2;
+                        x0 = vld1_u8( (uint8_t *) (src - cn) );
+                        x1 = vld1_u8( (uint8_t *) (src) );
+                        x2 = vld1_u8( (uint8_t *) (src + cn) );
+
+                        uint16x8_t y0, y1, y2;
+                        y0 = vaddl_u8(x0, x2);
+                        y1 = vshll_n_u8(x1, 1);
+                        y2 = vaddq_u16(y0, y1);
+
+                        uint16x8x2_t str;
+                        str.val[0] = y2; str.val[1] = zq;
+                        vst2q_u16( (uint16_t *) (dst + i), str );
+                    }
+                }
+                else if( kx[0] == -2 && kx[1] == 1 )
+                    return 0;
+                else
+                {
+                    return 0;
+                }
+            }
+            else if( _ksize == 5 )
+            {
+                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
+                    return 0;
+                else
+                {
+                    return 0;
+                }
+            }
+        }
+        else
+        {
+            if( _ksize == 3 )
+            {
+                if( kx[0] == 0 && kx[1] == 1 )
+                {
+                    return 0;
+                else
+                {
+                    return 0;
+                }
+            }
+            else if( _ksize == 5 )
+            {
+                return 0;
+            }
+        }
+
+        return i;
+    }
+
+    Mat kernel;
+    int symmetryType;
+    bool smallValues;
+};
+
+
+typedef RowNoVec RowVec_16s32f;
+typedef RowNoVec RowVec_32f;
+typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
+typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
+typedef ColumnNoVec SymmColumnVec_32s8u;
+typedef ColumnNoVec SymmColumnVec_32f16s;
+typedef ColumnNoVec SymmColumnVec_32f;
+typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
+typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
+typedef FilterNoVec FilterVec_8u;
+typedef FilterNoVec FilterVec_8u16s;
+typedef FilterNoVec FilterVec_32f;
+
+
 #else
 
 typedef RowNoVec RowVec_8u32s;

From cb48d7798dcc7851e17aeba0aea374760c03518a Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:08:23 +0200
Subject: [PATCH 02/20] SymmRowSmallVec_8u32s 1x3 general NEON speedup: 2.56x
 Auto-vect speedup: 1.26x

Test kernel: [1, 3, 1]
---
 modules/imgproc/src/filter.cpp | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 9750e51544..e53a637c21 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2276,7 +2276,33 @@ struct SymmRowSmallVec_8u32s
                     return 0;
                 else
                 {
-                    return 0;
+                    int32x4_t k32 = vdupq_n_s32(0);
+                    k32 = vld1q_lane_s32(kx, k32, 0);
+                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
+
+                    int16x4_t k = vqmovn_s32(k32);
+
+                    uint8x8_t z = vdup_n_u8(0);
+
+                    for( ; i <= width - 8; i += 8, src += 8 )
+                    {
+                        uint8x8_t x0, x1, x2;
+                        x0 = vld1_u8( (uint8_t *) (src - cn) );
+                        x1 = vld1_u8( (uint8_t *) (src) );
+                        x2 = vld1_u8( (uint8_t *) (src + cn) );
+
+                        int16x8_t y0, y1;
+                        int32x4_t y2, y3;
+                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
+                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
+                        y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
+                        y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
+                        y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
+                        y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);
+
+                        vst1q_s32((int32_t *)(dst + i), y2);
+                        vst1q_s32((int32_t *)(dst + i + 4), y3);
+                    }
                 }
             }
             else if( _ksize == 5 )

From c0019a42e47ea50dcc3dbf1e77e60937243d6fe9 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:10:58 +0200
Subject: [PATCH 03/20] SymmRowSmallVec_8u32s 1x5 general NEON speedup: 3.86x
 Auto-vect speedup: 1.67x

Test kernel: [0.0708, 0.2445, 0.3694, 0.2445, 0.0708]
---
 modules/imgproc/src/filter.cpp | 36 +++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index e53a637c21..54c5ec14ee 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2311,7 +2311,41 @@ struct SymmRowSmallVec_8u32s
                     return 0;
                 else
                 {
-                    return 0;
+                    int32x4_t k32 = vdupq_n_s32(0);
+                    k32 = vld1q_lane_s32(kx, k32, 0);
+                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
+                    k32 = vld1q_lane_s32(kx + 2, k32, 2);
+
+                    int16x4_t k = vqmovn_s32(k32);
+
+                    uint8x8_t z = vdup_n_u8(0);
+
+                    for( ; i <= width - 8; i += 8, src += 8 )
+                    {
+                        uint8x8_t x0, x1, x2, x3, x4;
+                        x0 = vld1_u8( (uint8_t *) (src - cn) );
+                        x1 = vld1_u8( (uint8_t *) (src) );
+                        x2 = vld1_u8( (uint8_t *) (src + cn) );
+
+                        int16x8_t y0, y1;
+                        int32x4_t accl, acch;
+                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
+                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
+                        accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
+                        accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
+                        acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
+                        acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);
+
+                        int16x8_t y2;
+                        x3 = vld1_u8( (uint8_t *) (src - cn*2) );
+                        x4 = vld1_u8( (uint8_t *) (src + cn*2) );
+                        y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
+                        accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
+                        acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);
+
+                        vst1q_s32((int32_t *)(dst + i), accl);
+                        vst1q_s32((int32_t *)(dst + i + 4), acch);
+                    }
                 }
             }
         }

From 969a218057d4b8b68d5b985d54553029262cb99a Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:11:52 +0200
Subject: [PATCH 04/20] SymmRowSmallVec_8u32s [-1, 0, 1] NEON speedup: 1.84x
 Auto-vect speedup: 1.2x

---
 modules/imgproc/src/filter.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 54c5ec14ee..e810bf59b4 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2355,7 +2355,21 @@ struct SymmRowSmallVec_8u32s
             {
                 if( kx[0] == 0 && kx[1] == 1 )
                 {
-                    return 0;
+                    uint8x8_t z = vdup_n_u8(0);
+
+                    for( ; i <= width - 8; i += 8, src += 8 )
+                    {
+                        uint8x8_t x0, x1;
+                        x0 = vld1_u8( (uint8_t *) (src - cn) );
+                        x1 = vld1_u8( (uint8_t *) (src + cn) );
+
+                        int16x8_t y0;
+                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
+                                vreinterpretq_s16_u16(vaddl_u8(x0, z)));
+
+                        vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
+                        vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
+                    }
                 else
                 {
                     return 0;

From 2e7b9a2c0f3f920caad5d2fc17713c6cc0f579b2 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:15:37 +0200
Subject: [PATCH 05/20] SymmRowSmallVec_8u32s 1x3 asymmetric NEON speedup:
 1.95x Auto-vect speedup: 1.17x

Test kernel: [-2, 0, 2]
---
 modules/imgproc/src/filter.cpp | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index e810bf59b4..55cde48742 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2370,9 +2370,32 @@ struct SymmRowSmallVec_8u32s
                         vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
                         vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
                     }
+                }
                 else
                 {
-                    return 0;
+                    int32x4_t k32 = vdupq_n_s32(0);
+                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
+
+                    int16x4_t k = vqmovn_s32(k32);
+
+                    uint8x8_t z = vdup_n_u8(0);
+
+                    for( ; i <= width - 8; i += 8, src += 8 )
+                    {
+                        uint8x8_t x0, x1;
+                        x0 = vld1_u8( (uint8_t *) (src - cn) );
+                        x1 = vld1_u8( (uint8_t *) (src + cn) );
+
+                        int16x8_t y0;
+                        int32x4_t y1, y2;
+                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
+                            vreinterpretq_s16_u16(vaddl_u8(x0, z)));
+                        y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
+                        y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);
+
+                        vst1q_s32((int32_t *)(dst + i), y1);
+                        vst1q_s32((int32_t *)(dst + i + 4), y2);
+                    }
                 }
             }
             else if( _ksize == 5 )

From 1fb966dc61f9d134fa4fce2e008270c576083489 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:23:09 +0200
Subject: [PATCH 06/20] SymmRowSmallVec_8u32s 1x5 asymm NEON speedup: 3.14x
 Auto-vect speedup: 1.6x

Test kernel: [-5, -2, 0, 2, 5]
---
 modules/imgproc/src/filter.cpp | 37 ++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 55cde48742..f5987c7848 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2400,7 +2400,40 @@ struct SymmRowSmallVec_8u32s
             }
             else if( _ksize == 5 )
             {
-                return 0;
+                int32x4_t k32 = vdupq_n_s32(0);
+                k32 = vld1q_lane_s32(kx + 1, k32, 1);
+                k32 = vld1q_lane_s32(kx + 2, k32, 2);
+
+                int16x4_t k = vqmovn_s32(k32);
+
+                uint8x8_t z = vdup_n_u8(0);
+
+                for( ; i <= width - 8; i += 8, src += 8 )
+                {
+                    uint8x8_t x0, x1;
+                    x0 = vld1_u8( (uint8_t *) (src - cn) );
+                    x1 = vld1_u8( (uint8_t *) (src + cn) );
+
+                    int32x4_t accl, acch;
+                    int16x8_t y0;
+                    y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
+                        vreinterpretq_s16_u16(vaddl_u8(x0, z)));
+                    accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
+                    acch = vmull_lane_s16(vget_high_s16(y0), k, 1);
+
+                    uint8x8_t x2, x3;
+                    x2 = vld1_u8( (uint8_t *) (src - cn*2) );
+                    x3 = vld1_u8( (uint8_t *) (src + cn*2) );
+
+                    int16x8_t y1;
+                    y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
+                        vreinterpretq_s16_u16(vaddl_u8(x2, z)));
+                    accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
+                    acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);
+
+                    vst1q_s32((int32_t *)(dst + i), accl);
+                    vst1q_s32((int32_t *)(dst + i + 4), acch);
+                }
             }
         }
 
@@ -2413,9 +2446,9 @@ struct SymmRowSmallVec_8u32s
 };
 
 
+typedef RowNoVec RowVec_8u32s;
 typedef RowNoVec RowVec_16s32f;
 typedef RowNoVec RowVec_32f;
-typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
 typedef ColumnNoVec SymmColumnVec_32s8u;
 typedef ColumnNoVec SymmColumnVec_32f16s;

From 4f5916f12dd782e2a1228f7b5c12e0c320cf3185 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:26:41 +0200
Subject: [PATCH 07/20] SymmColumnVec_32s8u NEON speedup: 1.96x Auto-vect
 speedup: 1x

Test kernel: [0.0708, 0.2445, 0.3694, 0.2445, 0.0708]
---
 modules/imgproc/src/filter.cpp | 106 ++++++++++++++++++++++++++++++++-
 1 file changed, 105 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index f5987c7848..7d41690256 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2446,11 +2446,115 @@ struct SymmRowSmallVec_8u32s
 };
 
 
+struct SymmColumnVec_32s8u
+{
+    SymmColumnVec_32s8u() { symmetryType=0; }
+    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
+    {
+        symmetryType = _symmetryType;
+        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+        delta = (float)(_delta/(1 << _bits));
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
+    }
+
+    int operator()(const uchar** _src, uchar* dst, int width) const
+    {
+        //Uncomment the two following lines when runtime support for neon is implemented.
+        // if( !checkHardwareSupport(CV_CPU_NEON) )
+        //     return 0;
+
+        int _ksize = kernel.rows + kernel.cols - 1;
+        int ksize2 = _ksize / 2;
+        const float* ky = kernel.ptr<float>() + ksize2;
+        int i = 0, k;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        const int** src = (const int**)_src;
+        const int *S, *S2;
+
+        float32x4_t d4 = vdupq_n_f32(delta);
+
+        if( symmetrical )
+        {
+            if( _ksize == 1 )
+                return 0;
+
+
+            float32x2_t k32;
+            k32 = vdup_n_f32(0);
+            k32 = vld1_lane_f32(ky, k32, 0);
+            k32 = vld1_lane_f32(ky + 1, k32, 1);
+
+            for( ; i <= width - 8; i += 8 )
+            {
+                float32x4_t accl, acch;
+                float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;
+
+                S = src[0] + i;
+
+                f0l = vcvtq_f32_s32( vld1q_s32(S) );
+                f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );
+
+                S = src[1] + i;
+                S2 = src[-1] + i;
+
+                f1l = vcvtq_f32_s32( vld1q_s32(S) );
+                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
+                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
+                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
+
+                accl = acch = d4;
+                accl = vmlaq_lane_f32(accl, f0l, k32, 0);
+                acch = vmlaq_lane_f32(acch, f0h, k32, 0);
+                accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
+                acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);
+
+                for( k = 2; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+
+                    float32x4_t f3l, f3h, f4l, f4h;
+                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
+                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
+                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
+                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
+
+                    accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
+                    acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
+                }
+
+                int32x4_t s32l, s32h;
+                s32l = vcvtq_s32_f32(accl);
+                s32h = vcvtq_s32_f32(acch);
+
+                int16x4_t s16l, s16h;
+                s16l = vqmovn_s32(s32l);
+                s16h = vqmovn_s32(s32h);
+
+                uint8x8_t u8;
+                u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
+
+                vst1_u8((uint8_t *)(dst + i), u8);
+            }
+        }
+        else
+        {
+            return 0;
+        }
+
+        return i;
+    }
+
+    int symmetryType;
+    float delta;
+    Mat kernel;
+};
+
+
 typedef RowNoVec RowVec_8u32s;
 typedef RowNoVec RowVec_16s32f;
 typedef RowNoVec RowVec_32f;
 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
-typedef ColumnNoVec SymmColumnVec_32s8u;
 typedef ColumnNoVec SymmColumnVec_32f16s;
 typedef ColumnNoVec SymmColumnVec_32f;
 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;

From 80a03644659788fcd77c8a5b40a2134753b1246f Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:29:54 +0200
Subject: [PATCH 08/20] SymmColumnVec_32s8u asymm NEON speedup: 2.95x Auto-vect
 speedup: 1x

Test kernel: [-0.9432, -1.1528, 0, 1.1528, 0.9432]
---
 modules/imgproc/src/filter.cpp | 50 +++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 7d41690256..08de1f1471 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2539,7 +2539,55 @@ struct SymmColumnVec_32s8u
         }
         else
         {
-            return 0;
+            float32x2_t k32;
+            k32 = vdup_n_f32(0);
+            k32 = vld1_lane_f32(ky + 1, k32, 1);
+
+            for( ; i <= width - 8; i += 8 )
+            {
+                float32x4_t accl, acch;
+                float32x4_t f1l, f1h, f2l, f2h;
+
+                S = src[1] + i;
+                S2 = src[-1] + i;
+
+                f1l = vcvtq_f32_s32( vld1q_s32(S) );
+                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
+                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
+                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
+
+                accl = acch = d4;
+                accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
+                acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);
+
+                for( k = 2; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+
+                    float32x4_t f3l, f3h, f4l, f4h;
+                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
+                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
+                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
+                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
+
+                    accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
+                    acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
+                }
+
+                int32x4_t s32l, s32h;
+                s32l = vcvtq_s32_f32(accl);
+                s32h = vcvtq_s32_f32(acch);
+
+                int16x4_t s16l, s16h;
+                s16l = vqmovn_s32(s32l);
+                s16h = vqmovn_s32(s32h);
+
+                uint8x8_t u8;
+                u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
+
+                vst1_u8((uint8_t *)(dst + i), u8);
+            }
         }
 
         return i;

From 4f906372e284937d9f7f5e95b04a726cffe1f986 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:33:11 +0200
Subject: [PATCH 09/20] SymmColumnSmallVec_32s16s [1, 2, 1] NEON speedup: 2.66x
 Auto-vect speedup: 1x

---
 modules/imgproc/src/filter.cpp | 85 +++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 08de1f1471..bcf8631759 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2599,13 +2599,96 @@ struct SymmColumnVec_32s8u
 };
 
 
+struct SymmColumnSmallVec_32s16s
+{
+    SymmColumnSmallVec_32s16s() { symmetryType=0; }
+    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
+    {
+        symmetryType = _symmetryType;
+        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+        delta = (float)(_delta/(1 << _bits));
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
+    }
+
+    int operator()(const uchar** _src, uchar* _dst, int width) const
+    {
+        //Uncomment the two following lines when runtime support for neon is implemented.
+        // if( !checkHardwareSupport(CV_CPU_NEON) )
+        //     return 0;
+
+        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+        const float* ky = kernel.ptr<float>() + ksize2;
+        int i = 0;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        const int** src = (const int**)_src;
+        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
+        short* dst = (short*)_dst;
+        float32x4_t df4 = vdupq_n_f32(delta);
+        int32x4_t d4 = vcvtq_s32_f32(df4);
+
+        if( symmetrical )
+        {
+            if( ky[0] == 2 && ky[1] == 1 )
+            {
+                for( ; i <= width - 4; i += 4 )
+                {
+                    int32x4_t x0, x1, x2;
+                    x0 = vld1q_s32((int32_t const *)(S0 + i));
+                    x1 = vld1q_s32((int32_t const *)(S1 + i));
+                    x2 = vld1q_s32((int32_t const *)(S2 + i));
+
+                    int32x4_t y0, y1, y2, y3;
+                    y0 = vaddq_s32(x0, x2);
+                    y1 = vqshlq_n_s32(x1, 1);
+                    y2 = vaddq_s32(y0, y1);
+                    y3 = vaddq_s32(y2, d4);
+
+                    int16x4_t t;
+                    t = vqmovn_s32(y3);
+
+                    vst1_s16((int16_t *)(dst + i), t);
+                }
+            }
+            else if( ky[0] == -2 && ky[1] == 1 )
+            {
+                return 0;
+            }
+            else if( ky[0] == 10 && ky[1] == 3 )
+            {
+                return 0;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+        else
+        {
+            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
+            {
+                return 0;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
+        return i;
+    }
+
+    int symmetryType;
+    float delta;
+    Mat kernel;
+};
+
+
 typedef RowNoVec RowVec_8u32s;
 typedef RowNoVec RowVec_16s32f;
 typedef RowNoVec RowVec_32f;
 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
 typedef ColumnNoVec SymmColumnVec_32f16s;
 typedef ColumnNoVec SymmColumnVec_32f;
-typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
 typedef FilterNoVec FilterVec_8u;
 typedef FilterNoVec FilterVec_8u16s;

From 61a7f48bf4d0794a6187b46491b74694f685e6fa Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:34:11 +0200
Subject: [PATCH 10/20] SymmColumnSmallVec_32s16s [1, -2, 1] NEON speedup:
 2.75x Auto-vect speedup: 1.01x

---
 modules/imgproc/src/filter.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index bcf8631759..8e8e0e498c 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2651,7 +2651,24 @@ struct SymmColumnSmallVec_32s16s
             }
             else if( ky[0] == -2 && ky[1] == 1 )
             {
-                return 0;
+                for( ; i <= width - 4; i += 4 )
+                {
+                    int32x4_t x0, x1, x2;
+                    x0 = vld1q_s32((int32_t const *)(S0 + i));
+                    x1 = vld1q_s32((int32_t const *)(S1 + i));
+                    x2 = vld1q_s32((int32_t const *)(S2 + i));
+
+                    int32x4_t y0, y1, y2, y3;
+                    y0 = vaddq_s32(x0, x2);
+                    y1 = vqshlq_n_s32(x1, 1);
+                    y2 = vsubq_s32(y0, y1);
+                    y3 = vaddq_s32(y2, d4);
+
+                    int16x4_t t;
+                    t = vqmovn_s32(y3);
+
+                    vst1_s16((int16_t *)(dst + i), t);
+                }
             }
             else if( ky[0] == 10 && ky[1] == 3 )
             {

From 33dfeb85be5e95820afe19243bb98093d946d623 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:35:52 +0200
Subject: [PATCH 11/20] SymmColumnSmallVec_32s16s [3, 10, 3] Scharr NEON
 speedup: 2.04x Auto-vect speedup: 1x

---
 modules/imgproc/src/filter.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 8e8e0e498c..9da0a2181c 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2672,7 +2672,24 @@ struct SymmColumnSmallVec_32s16s
             }
             else if( ky[0] == 10 && ky[1] == 3 )
             {
-                return 0;
+                for( ; i <= width - 4; i += 4 )
+                {
+                    int32x4_t x0, x1, x2, x3;
+                    x0 = vld1q_s32((int32_t const *)(S0 + i));
+                    x1 = vld1q_s32((int32_t const *)(S1 + i));
+                    x2 = vld1q_s32((int32_t const *)(S2 + i));
+
+                    x3 = vaddq_s32(x0, x2);
+
+                    int32x4_t y0;
+                    y0 = vmlaq_n_s32(d4, x1, 10);
+                    y0 = vmlaq_n_s32(y0, x3, 3);
+
+                    int16x4_t t;
+                    t = vqmovn_s32(y0);
+
+                    vst1_s16((int16_t *)(dst + i), t);
+                }
             }
             else
             {

From 99e782e62caa2d130eb09de9c5beb2fe2b9bf106 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:36:46 +0200
Subject: [PATCH 12/20] SymmColumnSmallVec_32s16s 3x1 NEON speedup: 1.75x
 Auto-vect speedup: 1x

---
 modules/imgproc/src/filter.cpp | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 9da0a2181c..7c2bd57526 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2693,7 +2693,32 @@ struct SymmColumnSmallVec_32s16s
             }
             else
             {
-                return 0;
+                float32x2_t k32 = vdup_n_f32(0);
+                k32 = vld1_lane_f32(ky, k32, 0);
+                k32 = vld1_lane_f32(ky + 1, k32, 1);
+
+                for( ; i <= width - 4; i += 4 )
+                {
+                    int32x4_t x0, x1, x2, x3, x4;
+                    x0 = vld1q_s32((int32_t const *)(S0 + i));
+                    x1 = vld1q_s32((int32_t const *)(S1 + i));
+                    x2 = vld1q_s32((int32_t const *)(S2 + i));
+
+                    x3 = vaddq_s32(x0, x2);
+
+                    float32x4_t s0, s1, s2;
+                    s0 = vcvtq_f32_s32(x1);
+                    s1 = vcvtq_f32_s32(x3);
+                    s2 = vmlaq_lane_f32(df4, s0, k32, 0);
+                    s2 = vmlaq_lane_f32(s2, s1, k32, 1);
+
+                    x4 = vcvtq_s32_f32(s2);
+
+                    int16x4_t x5;
+                    x5 = vqmovn_s32(x4);
+
+                    vst1_s16((int16_t *)(dst + i), x5);
+                }
             }
         }
         else

From 4443d6b0a1ab5932f8a878b474108b88156ab9c5 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:37:52 +0200
Subject: [PATCH 13/20] SymmColumnSmallVec_32s16s [-1, 0, 1] NEON speedup:
 3.27x Auto-vect speedup: 1.01x

---
 modules/imgproc/src/filter.cpp | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 7c2bd57526..de8090cd19 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2725,7 +2725,23 @@ struct SymmColumnSmallVec_32s16s
         {
             if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
             {
-                return 0;
+                if( ky[1] < 0 )
+                    std::swap(S0, S2);
+                for( ; i <= width - 4; i += 4 )
+                {
+                    int32x4_t x0, x1;
+                    x0 = vld1q_s32((int32_t const *)(S0 + i));
+                    x1 = vld1q_s32((int32_t const *)(S2 + i));
+
+                    int32x4_t y0, y1;
+                    y0 = vsubq_s32(x1, x0);
+                    y1 = vqaddq_s32(y0, d4);
+
+                    int16x4_t t;
+                    t = vqmovn_s32(y1);
+
+                    vst1_s16((int16_t *)(dst + i), t);
+                }
             }
             else
             {

From 37e018454d6b4f3f72320e607348190ed10a8252 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:40:55 +0200
Subject: [PATCH 14/20] SymmColumnSmallVec_32s16s 3x1 asymm NEON speedup: 2.12x
 Auto-vect speedup: 1.01x

Test kernel: [-2, 0, 2]
---
 modules/imgproc/src/filter.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index de8090cd19..f60558c6d7 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2745,7 +2745,28 @@ struct SymmColumnSmallVec_32s16s
             }
             else
             {
-                return 0;
+                float32x2_t k32 = vdup_n_f32(0);
+                k32 = vld1_lane_f32(ky + 1, k32, 1);
+
+                for( ; i <= width - 4; i += 4 )
+                {
+                    int32x4_t x0, x1, x2, x3;
+                    x0 = vld1q_s32((int32_t const *)(S0 + i));
+                    x1 = vld1q_s32((int32_t const *)(S2 + i));
+
+                    x2 = vsubq_s32(x1, x0);
+
+                    float32x4_t s0, s1;
+                    s0 = vcvtq_f32_s32(x2);
+                    s1 = vmlaq_lane_f32(df4, s0, k32, 1);
+
+                    x3 = vcvtq_s32_f32(s1);
+
+                    int16x4_t x4;
+                    x4 = vqmovn_s32(x3);
+
+                    vst1_s16((int16_t *)(dst + i), x4);
+                }
             }
         }
 

From a2a131799fe9a7d3ba21dab00bb1056c51095a5d Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:42:31 +0200
Subject: [PATCH 15/20] SymmColumnVec_32f16s NEON speedup: 8.64x Auto-vect
 speedup: 1x

Test kernel: [0.1, 0.2408, 0.3184, 0.2408, 0.1]
---
 modules/imgproc/src/filter.cpp | 108 ++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index f60558c6d7..6991a448cc 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2779,11 +2779,117 @@ struct SymmColumnSmallVec_32s16s
 };
 
 
+struct SymmColumnVec_32f16s
+{
+    SymmColumnVec_32f16s() { symmetryType=0; }
+    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
+    {
+        symmetryType = _symmetryType;
+        kernel = _kernel;
+        delta = (float)_delta;
+        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
+        //Uncomment the following line when runtime support for neon is implemented.
+        // neon_supported = checkHardwareSupport(CV_CPU_NEON);
+    }
+
+    int operator()(const uchar** _src, uchar* _dst, int width) const
+    {
+        //Uncomment the two following lines when runtime support for neon is implemented.
+        // if( !neon_supported )
+        //     return 0;
+
+        int _ksize = kernel.rows + kernel.cols - 1;
+        int ksize2 = _ksize / 2;
+        const float* ky = kernel.ptr<float>() + ksize2;
+        int i = 0, k;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        const float** src = (const float**)_src;
+        const float *S, *S2;
+        short* dst = (short*)_dst;
+
+        float32x4_t d4 = vdupq_n_f32(delta);
+
+        if( symmetrical )
+        {
+            if( _ksize == 1 )
+                return 0;
+
+
+            float32x2_t k32;
+            k32 = vdup_n_f32(0);
+            k32 = vld1_lane_f32(ky, k32, 0);
+            k32 = vld1_lane_f32(ky + 1, k32, 1);
+
+            for( ; i <= width - 8; i += 8 )
+            {
+                float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
+                float32x4_t accl, acch;
+
+                S = src[0] + i;
+
+                x0l = vld1q_f32(S);
+                x0h = vld1q_f32(S + 4);
+
+                S = src[1] + i;
+                S2 = src[-1] + i;
+
+                x1l = vld1q_f32(S);
+                x1h = vld1q_f32(S + 4);
+                x2l = vld1q_f32(S2);
+                x2h = vld1q_f32(S2 + 4);
+
+                accl = acch = d4;
+                accl = vmlaq_lane_f32(accl, x0l, k32, 0);
+                acch = vmlaq_lane_f32(acch, x0h, k32, 0);
+                accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
+                acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
+
+                for( k = 2; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+
+                    float32x4_t x3l, x3h, x4l, x4h;
+                    x3l = vld1q_f32(S);
+                    x3h = vld1q_f32(S + 4);
+                    x4l = vld1q_f32(S2);
+                    x4h = vld1q_f32(S2 + 4);
+
+                    accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
+                    acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
+                }
+
+                int32x4_t s32l, s32h;
+                s32l = vcvtq_s32_f32(accl);
+                s32h = vcvtq_s32_f32(acch);
+
+                int16x4_t s16l, s16h;
+                s16l = vqmovn_s32(s32l);
+                s16h = vqmovn_s32(s32h);
+
+                vst1_s16((int16_t *)(dst + i), s16l);
+                vst1_s16((int16_t *)(dst + i + 4), s16h);
+            }
+        }
+        else
+        {
+            return 0;
+        }
+
+        return i;
+    }
+
+    int symmetryType;
+    float delta;
+    Mat kernel;
+    bool neon_supported;
+};
+
+
 typedef RowNoVec RowVec_8u32s;
 typedef RowNoVec RowVec_16s32f;
 typedef RowNoVec RowVec_32f;
 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
-typedef ColumnNoVec SymmColumnVec_32f16s;
 typedef ColumnNoVec SymmColumnVec_32f;
 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
 typedef FilterNoVec FilterVec_8u;

From ed0ce48179663a89e1458a705ec090566c019079 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:44:39 +0200
Subject: [PATCH 16/20] SymmColumnVec_32f16s asymm NEON speedup: 9.46x
 Auto-vect speedup: 1x

Test kernel: [-0.9432, -1.1528, 0, 1.1528, 0.9432]
---
 modules/imgproc/src/filter.cpp | 48 +++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 6991a448cc..d0516f56f4 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2873,7 +2873,53 @@ struct SymmColumnVec_32f16s
         }
         else
         {
-            return 0;
+            float32x2_t k32;
+            k32 = vdup_n_f32(0);
+            k32 = vld1_lane_f32(ky + 1, k32, 1);
+
+            for( ; i <= width - 8; i += 8 )
+            {
+                float32x4_t x1l, x1h, x2l, x2h;
+                float32x4_t accl, acch;
+
+                S = src[1] + i;
+                S2 = src[-1] + i;
+
+                x1l = vld1q_f32(S);
+                x1h = vld1q_f32(S + 4);
+                x2l = vld1q_f32(S2);
+                x2h = vld1q_f32(S2 + 4);
+
+                accl = acch = d4;
+                accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
+                acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
+
+                for( k = 2; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+
+                    float32x4_t x3l, x3h, x4l, x4h;
+                    x3l = vld1q_f32(S);
+                    x3h = vld1q_f32(S + 4);
+                    x4l = vld1q_f32(S2);
+                    x4h = vld1q_f32(S2 + 4);
+
+                    accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
+                    acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
+                }
+
+                int32x4_t s32l, s32h;
+                s32l = vcvtq_s32_f32(accl);
+                s32h = vcvtq_s32_f32(acch);
+
+                int16x4_t s16l, s16h;
+                s16l = vqmovn_s32(s32l);
+                s16h = vqmovn_s32(s32h);
+
+                vst1_s16((int16_t *)(dst + i), s16l);
+                vst1_s16((int16_t *)(dst + i + 4), s16h);
+            }
         }
 
         return i;

From 13c08551146f9d7c614a15742a3b6324807e9691 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:47:06 +0200
Subject: [PATCH 17/20] SymmRowSmallVec_32f 1x5 NEON speedup: 2.36x Auto-vect
 speedup: 2.36x

Test kernel: [0.1, 0.2408, 0.3184, 0.2408, 0.1]
---
 modules/imgproc/src/filter.cpp | 94 +++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index d0516f56f4..a615368479 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2932,10 +2932,102 @@ struct SymmColumnVec_32f16s
 };
 
 
+struct SymmRowSmallVec_32f
+{
+    SymmRowSmallVec_32f() {}
+    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
+    {
+        kernel = _kernel;
+        symmetryType = _symmetryType;
+    }
+
+    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
+    {
+        //Uncomment the two following lines when runtime support for neon is implemented.
+        // if( !checkHardwareSupport(CV_CPU_NEON) )
+        //     return 0;
+
+        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
+        float* dst = (float*)_dst;
+        const float* src = (const float*)_src + (_ksize/2)*cn;
+        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
+        const float* kx = kernel.ptr<float>() + _ksize/2;
+        width *= cn;
+
+        if( symmetrical )
+        {
+            if( _ksize == 1 )
+                return 0;
+            if( _ksize == 3 )
+            {
+                if( kx[0] == 2 && kx[1] == 1 )
+                    return 0;
+                else if( kx[0] == -2 && kx[1] == 1 )
+                    return 0;
+                else
+                {
+                    return 0;
+                }
+            }
+            else if( _ksize == 5 )
+            {
+                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
+                    return 0;
+                else
+                {
+                    float32x2_t k0, k1;
+                    k0 = k1 = vdup_n_f32(0);
+                    k0 = vld1_lane_f32(kx + 0, k0, 0);
+                    k0 = vld1_lane_f32(kx + 1, k0, 1);
+                    k1 = vld1_lane_f32(kx + 2, k1, 0);
+
+                    for( ; i <= width - 4; i += 4, src += 4 )
+                    {
+                        float32x4_t x0, x1, x2, x3, x4;
+                        x0 = vld1q_f32(src);
+                        x1 = vld1q_f32(src - cn);
+                        x2 = vld1q_f32(src + cn);
+                        x3 = vld1q_f32(src - cn*2);
+                        x4 = vld1q_f32(src + cn*2);
+
+                        float32x4_t y0;
+                        y0 = vmulq_lane_f32(x0, k0, 0);
+                        y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
+                        y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
+
+                        vst1q_f32(dst + i, y0);
+                    }
+                }
+            }
+        }
+        else
+        {
+            if( _ksize == 3 )
+            {
+                if( kx[0] == 0 && kx[1] == 1 )
+                    return 0;
+                else
+                {
+                    return 0;
+                }
+            }
+            else if( _ksize == 5 )
+            {
+                return 0;
+            }
+        }
+
+        return i;
+    }
+
+    Mat kernel;
+    int symmetryType;
+};
+
+
 typedef RowNoVec RowVec_8u32s;
 typedef RowNoVec RowVec_16s32f;
 typedef RowNoVec RowVec_32f;
-typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
 typedef ColumnNoVec SymmColumnVec_32f;
 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
 typedef FilterNoVec FilterVec_8u;

From 9c6da035049c7aa1d611fbbd8b8fb1a84406a2c4 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Fri, 19 Dec 2014 22:51:42 +0200
Subject: [PATCH 18/20] SymmRowSmallVec_32f 1x5 asymm NEON speedup: 2.31x
 Auto-vect speedup: 2.26x

Test kernel: [-0.9432, -1.1528, 0, 1.1528, 0.9432]
---
 modules/imgproc/src/filter.cpp | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index a615368479..1e939d0699 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -3013,7 +3013,25 @@ struct SymmRowSmallVec_32f
             }
             else if( _ksize == 5 )
             {
-                return 0;
+                float32x2_t k;
+                k = vdup_n_f32(0);
+                k = vld1_lane_f32(kx + 1, k, 0);
+                k = vld1_lane_f32(kx + 2, k, 1);
+
+                for( ; i <= width - 4; i += 4, src += 4 )
+                {
+                    float32x4_t x0, x1, x2, x3;
+                    x0 = vld1q_f32(src - cn);
+                    x1 = vld1q_f32(src + cn);
+                    x2 = vld1q_f32(src - cn*2);
+                    x3 = vld1q_f32(src + cn*2);
+
+                    float32x4_t y0;
+                    y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
+                    y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
+
+                    vst1q_f32(dst + i, y0);
+                }
             }
         }
 

From 9811a739b0dcab45003de7eb3c41f027a6280c41 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Sat, 20 Dec 2014 17:14:21 +0200
Subject: [PATCH 19/20] Change gaussianBlur5x5 perf test epsilon Set it 1
 instead of 0.001, as is already done in gaussianBlur3x3. That will allow
 integer destination matrices that are not exactly the same, but very close to
 the expected result, to pass the test.

---
 modules/imgproc/perf/perf_blur.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp
index 3fc953ef18..58a0c7cbab 100644
--- a/modules/imgproc/perf/perf_blur.cpp
+++ b/modules/imgproc/perf/perf_blur.cpp
@@ -183,7 +183,7 @@ PERF_TEST_P(Size_MatType_BorderType, gaussianBlur5x5,
 
     TEST_CYCLE() GaussianBlur(src, dst, Size(5,5), 0, 0, btype);
 
-    SANITY_CHECK(dst, 1e-3);
+    SANITY_CHECK(dst, 1);
 }
 
 PERF_TEST_P(Size_MatType_BorderType, blur5x5,

From fffe2464cd77da8ca90de08aa34d186175f342a1 Mon Sep 17 00:00:00 2001
From: orestis <orestis@ee.auth.gr>
Date: Sun, 21 Dec 2014 21:27:03 +0200
Subject: [PATCH 20/20] Change DescriptorExtractor_ORB regression test to
 compensate for neon ieee754 non-compliancy. Also changed the comparison
 between max valid and calculated distance to make the error message more
 accurate (in case curMaxDist == maxDist)

---
 modules/features2d/test/test_descriptors_regression.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/features2d/test/test_descriptors_regression.cpp b/modules/features2d/test/test_descriptors_regression.cpp
index 04449bdfc7..e40fe9fb11 100644
--- a/modules/features2d/test/test_descriptors_regression.cpp
+++ b/modules/features2d/test/test_descriptors_regression.cpp
@@ -132,7 +132,7 @@ protected:
 
         stringstream ss;
         ss << "Max distance between valid and computed descriptors " << curMaxDist;
-        if( curMaxDist < maxDist )
+        if( curMaxDist <= maxDist )
             ss << "." << endl;
         else
         {
@@ -322,7 +322,11 @@ TEST( Features2d_DescriptorExtractor_ORB, regression )
 {
     // TODO adjust the parameters below
     CV_DescriptorExtractorTest<Hamming> test( "descriptor-orb",
+#if CV_NEON
+                                              (CV_DescriptorExtractorTest<Hamming>::DistanceType)25.f,
+#else
                                               (CV_DescriptorExtractorTest<Hamming>::DistanceType)12.f,
+#endif
                                              ORB::create() );
     test.safe_run();
 }