CV_ENABLE_UNROLLED

pull/13383/head
Victoria Zhislina 13 years ago
parent 68a54e7f3c
commit fbdb93ec79
9 changed files:

  1. modules/core/include/opencv2/core/internal.hpp (6 changed lines)
  2. modules/core/include/opencv2/core/operations.hpp (31 changed lines)
  3. modules/core/src/arithm.cpp (45 changed lines)
  4. modules/core/src/convert.cpp (59 changed lines)
  5. modules/core/src/copy.cpp (2 changed lines)
  6. modules/core/src/lapack.cpp (6 changed lines)
  7. modules/core/src/matmul.cpp (48 changed lines)
  8. modules/core/src/matrix.cpp (27 changed lines)
  9. modules/core/src/stat.cpp (27 changed lines)

@@ -131,6 +131,12 @@ CV_INLINE IppiSize ippiSize(int width, int height)
 #define CPU_HAS_NEON_FEATURE (false)
 #endif
 
+#ifdef CV_ICC
+#define CV_ENABLE_UNROLLED 0
+#else
+#define CV_ENABLE_UNROLLED 1
+#endif
+
 #ifndef IPPI_CALL
 #define IPPI_CALL(func) CV_Assert((func) >= 0)
 #endif
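This macro is the switch every hunk below keys on: under the Intel compiler (CV_ICC) the hand-unrolled loop bodies are compiled out, on the assumption that ICC vectorizes the plain loops better on its own. A minimal standalone sketch of the pattern the rest of the commit applies (sumArray is illustrative, not from the patch):

    #include <cstddef>

    #ifndef CV_ENABLE_UNROLLED
    #define CV_ENABLE_UNROLLED 1
    #endif

    // A 4x manually unrolled accumulation, compiled out when
    // CV_ENABLE_UNROLLED is 0; the scalar tail loop handles the
    // remaining 0..3 elements in either configuration.
    static double sumArray(const float* a, int n)
    {
        double s = 0;
        int i = 0;
    #if CV_ENABLE_UNROLLED
        for( ; i <= n - 4; i += 4 )
            s += (double)a[i] + a[i+1] + a[i+2] + a[i+3];
    #endif
        for( ; i < n; i++ )
            s += a[i];
        return s;
    }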

@@ -48,6 +48,7 @@
 #include <limits.h>
 #endif // SKIP_INCLUDES
 
 #ifdef __cplusplus
+
 /////// exchange-add operation for atomic operations on reference counters ///////
@@ -903,12 +904,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i=0;
+#if CV_ENABLE_UNROLLED
+    for( ; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i], v1 = a[i+1], v2 = a[i+2], v3 = a[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i];
@@ -922,12 +925,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i = 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         s += (_AccTp)fast_abs(a[i]) + (_AccTp)fast_abs(a[i+1]) +
             (_AccTp)fast_abs(a[i+2]) + (_AccTp)fast_abs(a[i+3]);
     }
+#endif
     for( ; i < n; i++ )
         s += fast_abs(a[i]);
     return s;
@@ -948,12 +953,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
@@ -986,12 +993,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += std::abs(v0) + std::abs(v1) + std::abs(v2) + std::abs(v3);
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
@@ -2422,14 +2431,16 @@ template<typename _Tp> inline typename DataType<_Tp>::work_type
 dot(const Vector<_Tp>& v1, const Vector<_Tp>& v2)
 {
     typedef typename DataType<_Tp>::work_type _Tw;
-    size_t i, n = v1.size();
+    size_t i = 0, n = v1.size();
     assert(v1.size() == v2.size());
     _Tw s = 0;
     const _Tp *ptr1 = &v1[0], *ptr2 = &v2[0];
+#if CV_ENABLE_UNROLLED
-    for( i = 0; i <= n - 4; i += 4 )
+    for(; i <= n - 4; i += 4 )
         s += (_Tw)ptr1[i]*ptr2[i] + (_Tw)ptr1[i+1]*ptr2[i+1] +
             (_Tw)ptr1[i+2]*ptr2[i+2] + (_Tw)ptr1[i+3]*ptr2[i+3];
+#endif
     for( ; i < n; i++ )
         s += (_Tw)ptr1[i]*ptr2[i];
     return s;
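One caveat in the dot() hunk above: i and n are size_t, so when n < 4 the bound n - 4 wraps to a huge unsigned value and the unrolled loop would read past both buffers; the int-indexed norm kernels earlier in the file do not have this problem. A hedged sketch of an overflow-safe formulation (dotSafe is a hypothetical helper, not proposed in the patch):

    #include <cstddef>

    // Unsigned-safe bound: test i + 4 <= n instead of i <= n - 4,
    // so the unrolled loop is simply skipped when n < 4.
    static double dotSafe(const float* a, const float* b, std::size_t n)
    {
        double s = 0;
        std::size_t i = 0;
        for( ; i + 4 <= n; i += 4 )
            s += (double)a[i]*b[i] + (double)a[i+1]*b[i+1] +
                 (double)a[i+2]*b[i+2] + (double)a[i+3]*b[i+3];
        for( ; i < n; i++ )
            s += (double)a[i]*b[i];
        return s;
    }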

@@ -99,7 +99,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         T v0 = op(src1[x], src2[x]);
@@ -109,7 +109,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -208,7 +208,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         int v0 = op(src1[x], src2[x]);
@@ -218,7 +218,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -265,6 +265,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
         }
     }
 #endif
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         float v0 = op(src1[x], src2[x]);
@@ -274,7 +275,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -1508,8 +1509,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
 {
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
-        int i;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        int i=0;
+#if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
            T t0;
            T t1;
@@ -1523,7 +1525,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
            dst[i+2] = t0;
            dst[i+3] = t1;
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = saturate_cast<T>(src1[i] * src2[i]);
     }
@@ -1532,8 +1534,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
 {
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
-        int i;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        int i = 0;
+#if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
             T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
             T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
@@ -1543,7 +1546,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
             dst[i+2] = t0; dst[i+3] = t1;
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
     }
@@ -1561,6 +1564,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int i = 0;
+#if CV_ENABLE_UNROLLED
         for( ; i <= size.width - 4; i += 4 )
         {
             if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1590,7 +1594,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
     }
@@ -1606,6 +1610,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
     for( ; size.height--; src2 += step2, dst += step )
     {
         int i = 0;
+#if CV_ENABLE_UNROLLED
         for( ; i <= size.width - 4; i += 4 )
         {
             if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1635,7 +1640,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
     }
@@ -1834,6 +1839,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
@@ -1844,7 +1850,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
             dst[x+2] = t0; dst[x+3] = t1;
         }
-
+#endif
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
     }
@@ -1891,6 +1897,7 @@ addWeighted8u( const uchar* src1, size_t step1,
             }
         }
 #endif
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             float t0, t1;
@@ -1906,6 +1913,7 @@ addWeighted8u( const uchar* src1, size_t step1,
             dst[x+2] = saturate_cast<uchar>(t0);
             dst[x+3] = saturate_cast<uchar>(t1);
         }
+#endif
 
         for( ; x < size.width; x++ )
         {
@@ -1994,6 +2002,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             int t0, t1;
@@ -2004,7 +2013,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = -(src1[x+3] > src2[x+3]) ^ m;
             dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
         }
-
+#endif
         for( ; x < size.width; x++ )
             dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
     }
@@ -2015,6 +2024,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             int t0, t1;
@@ -2025,7 +2035,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = -(src1[x+3] == src2[x+3]) ^ m;
             dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
         }
-
+#endif
         for( ; x < size.width; x++ )
             dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
     }
@@ -2382,6 +2392,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            int t0, t1;
@@ -2392,7 +2403,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
            t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
            dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
        }
-
+#endif
        for( ; x < size.width; x++ )
            dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
    }
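The cmp_ and inRange_ hunks all lean on the same branchless trick: a comparison yields 0 or 1, unary minus turns that into 0 or all-ones in two's complement, XOR with m (0 or 255) optionally inverts the predicate, and the uchar cast keeps the low byte as 0x00 or 0xFF. A self-contained illustration (cmpMask is an illustrative helper, not OpenCV API):

    #include <cassert>

    // Branchless compare-to-mask: -(a > b) is 0 or ~0, XOR with m
    // (0 or 255) flips the low byte for the inverted predicate, and
    // the cast to unsigned char keeps 0x00 or 0xFF.
    static unsigned char cmpMask(int a, int b, int m)
    {
        return (unsigned char)(-(a > b) ^ m);
    }

    int main()
    {
        assert(cmpMask(5, 3, 0)   == 0xFF);  // a > b, predicate kept
        assert(cmpMask(3, 5, 0)   == 0x00);
        assert(cmpMask(5, 3, 255) == 0x00);  // m = 255 inverts: a <= b
        assert(cmpMask(3, 5, 255) == 0xFF);
        return 0;
    }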

@@ -578,7 +578,8 @@ cvtScaleAbs_( const T* src, size_t sstep,
             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
     }
 }
 
+
 template<typename T, typename DT, typename WT> static void
 cvtScale_( const T* src, size_t sstep,
            DT* dst, size_t dstep, Size size,
@@ -590,6 +591,7 @@ cvtScale_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             DT t0, t1;
@@ -600,6 +602,7 @@ cvtScale_( const T* src, size_t sstep,
             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
             dst[x+2] = t0; dst[x+3] = t1;
         }
+#endif
 
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
@@ -655,22 +658,54 @@ cvt_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
-        for( ; x <= size.width - 4; x += 4 )
-        {
-            DT t0, t1;
-            t0 = saturate_cast<DT>(src[x]);
-            t1 = saturate_cast<DT>(src[x+1]);
-            dst[x] = t0; dst[x+1] = t1;
-            t0 = saturate_cast<DT>(src[x+2]);
-            t1 = saturate_cast<DT>(src[x+3]);
-            dst[x+2] = t0; dst[x+3] = t1;
-        }
+        for( ; x <= size.width - 4; x += 4 )
+        {
+            DT t0, t1;
+            t0 = saturate_cast<DT>(src[x]);
+            t1 = saturate_cast<DT>(src[x+1]);
+            dst[x] = t0; dst[x+1] = t1;
+            t0 = saturate_cast<DT>(src[x+2]);
+            t1 = saturate_cast<DT>(src[x+3]);
+            dst[x+2] = t0; dst[x+3] = t1;
+        }
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]);
     }
 }
 
+//vz optimized template specialization, test Core_ConvertScale/ElemWiseTest
+template<> static void
+cvt_<float, short>( const float* src, size_t sstep,
+                    short* dst, size_t dstep, Size size )
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+#if CV_SSE2
+        if(USE_SSE2){
+            for( ; x <= size.width - 8; x += 8 )
+            {
+                __m128 src128 = _mm_loadu_ps (src + x);
+                __m128i src_int128 = _mm_cvtps_epi32 (src128);
+
+                src128 = _mm_loadu_ps (src + x + 4);
+                __m128i src1_int128 = _mm_cvtps_epi32 (src128);
+
+                src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
+                _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
+            }
+        }
+#endif
+        for( ; x < size.width; x++ )
+            dst[x] = (src[x]);
+    }
+}
+
 template<typename T> static void
 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
 {
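A note on the cvt_<float, short> specialization above: _mm_cvtps_epi32 converts using the current rounding mode (round-to-nearest-even by default) and _mm_packs_epi32 narrows the two int32 vectors to int16 with signed saturation, so the vector path behaves like saturate_cast<short>. The scalar tail, dst[x] = (src[x]), truncates toward zero instead, which is presumably why the comment points at the Core_ConvertScale/ElemWiseTest tests. A standalone sketch of the same kernel, assuming an SSE2 target:

    #include <emmintrin.h>  // SSE2 intrinsics

    // Convert n floats to shorts: round-to-nearest plus signed saturation
    // in the SSE2 body (8 elements per iteration), plain cast in the tail.
    static void cvtFloatToShort(const float* src, short* dst, int n)
    {
        int x = 0;
        for( ; x <= n - 8; x += 8 )
        {
            __m128i lo = _mm_cvtps_epi32(_mm_loadu_ps(src + x));     // 4 floats -> int32, rounded
            __m128i hi = _mm_cvtps_epi32(_mm_loadu_ps(src + x + 4));
            _mm_storeu_si128((__m128i*)(dst + x),
                             _mm_packs_epi32(lo, hi));               // saturating pack to int16
        }
        for( ; x < n; x++ )
            dst[x] = (short)src[x];  // note: truncates, unlike the packed path
    }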

@@ -59,6 +59,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
         const T* src = (const T*)_src;
         T* dst = (T*)_dst;
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             if( mask[x] )
@@ -70,6 +71,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
             if( mask[x+3] )
                 dst[x+3] = src[x+3];
         }
+#endif
         for( ; x < size.width; x++ )
             if( mask[x] )
                 dst[x] = src[x];
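copyMask_ gets the same gating, though the unrolled body keeps a branch per element, so here unrolling only amortizes loop overhead rather than removing branches. The kernel in isolation (copyMasked is an illustrative rewrite for one element type):

    #ifndef CV_ENABLE_UNROLLED
    #define CV_ENABLE_UNROLLED 1
    #endif

    // Masked copy in the spirit of copyMask_: each element is copied only
    // where the mask byte is non-zero; the per-element tests remain.
    static void copyMasked(const int* src, const unsigned char* mask, int* dst, int n)
    {
        int x = 0;
    #if CV_ENABLE_UNROLLED
        for( ; x <= n - 4; x += 4 )
        {
            if( mask[x] )   dst[x]   = src[x];
            if( mask[x+1] ) dst[x+1] = src[x+1];
            if( mask[x+2] ) dst[x+2] = src[x+2];
            if( mask[x+3] ) dst[x+3] = src[x+3];
        }
    #endif
        for( ; x < n; x++ )
            if( mask[x] )
                dst[x] = src[x];
    }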

@@ -741,7 +741,9 @@ MatrAXPY( int m, int n, const T1* x, int dx,
     for( i = 0; i < m; i++, x += dx, y += dy )
     {
         T2 s = a[i*inca];
-        for( j = 0; j <= n - 4; j += 4 )
+        j=0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
         {
             T3 t0 = (T3)(y[j] + s*x[j]);
             T3 t1 = (T3)(y[j+1] + s*x[j+1]);
@@ -752,7 +754,7 @@ MatrAXPY( int m, int n, const T1* x, int dx,
             y[j+2] = t0;
             y[j+3] = t1;
         }
-
+#endif
         for( ; j < n; j++ )
             y[j] = (T3)(y[j] + s*x[j]);
     }
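For reference, MatrAXPY is a row-wise AXPY update, y_row += a[i] * x_row, the building block of the decomposition routines in this file. Stripped of the three template element types and the unrolling, the kernel reduces to (a sketch in plain double, strides in elements):

    // Row-wise AXPY as computed by MatrAXPY: for each of the m rows,
    // add a[i*inca] times the i-th row of x into the i-th row of y.
    static void matrAXPY(int m, int n, const double* x, int dx,
                         const double* a, int inca, double* y, int dy)
    {
        for( int i = 0; i < m; i++, x += dx, y += dy )
        {
            double s = a[i*inca];
            for( int j = 0; j < n; j++ )
                y[j] += s * x[j];
        }
    }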

@@ -63,7 +63,9 @@ GEMM_CopyBlock( const uchar* src, size_t src_step,
     for( ; size.height--; src += src_step, dst += dst_step )
     {
-        for( j = 0; j <= size.width - 4; j += 4 )
+        j=0;
+#if CV_ENABLE_UNROLLED
+        for( ; j <= size.width - 4; j += 4 )
         {
             int t0 = ((const int*)src)[j];
             int t1 = ((const int*)src)[j+1];
@@ -74,7 +76,7 @@ GEMM_CopyBlock( const uchar* src, size_t src_step,
             ((int*)dst)[j+2] = t0;
             ((int*)dst)[j+3] = t1;
         }
-
+#endif
         for( ; j < size.width; j++ )
             ((int*)dst)[j] = ((const int*)src)[j];
     }
@@ -237,15 +239,16 @@ GEMMSingleMul( const T* a_data, size_t a_step,
              c_data += c_step1 )
         {
             WT s0(0), s1(0), s2(0), s3(0);
-            for( k = 0; k <= n - 4; k += 4 )
+            k = 0;
+#if CV_ENABLE_UNROLLED
+            for( ; k <= n - 4; k += 4 )
             {
                 s0 += WT(a_data[k])*WT(b_data[k]);
                 s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                 s2 += WT(a_data[k+2])*WT(b_data[k+2]);
                 s3 += WT(a_data[k+3])*WT(b_data[k+3]);
             }
+#endif
             for( ; k < n; k++ )
                 s0 += WT(a_data[k])*WT(b_data[k]);
             s0 = (s0+s1+s2+s3)*alpha;
@@ -342,8 +345,9 @@ GEMMSingleMul( const T* a_data, size_t a_step,
         for( k = 0; k < n; k++, b_data += b_step )
         {
             WT al(a_data[k]);
-            for( j = 0; j <= m - 4; j += 4 )
+            j=0;
+#if CV_ENABLE_UNROLLED
+            for(; j <= m - 4; j += 4 )
             {
                 WT t0 = d_buf[j] + WT(b_data[j])*al;
                 WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
@@ -354,7 +358,7 @@ GEMMSingleMul( const T* a_data, size_t a_step,
                 d_buf[j+2] = t0;
                 d_buf[j+3] = t1;
             }
-
+#endif
             for( ; j < m; j++ )
                 d_buf[j] += WT(b_data[j])*al;
         }
@@ -509,7 +513,9 @@ GEMMStore( const T* c_data, size_t c_step,
     if( _c_data )
     {
         c_data = _c_data;
-        for( j = 0; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
+        j=0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
         {
             WT t0 = alpha*d_buf[j];
             WT t1 = alpha*d_buf[j+1];
@@ -524,6 +530,7 @@ GEMMStore( const T* c_data, size_t c_step,
             d_data[j+2] = T(t0);
             d_data[j+3] = T(t1);
         }
+#endif
         for( ; j < d_size.width; j++, c_data += c_step1 )
         {
             WT t0 = alpha*d_buf[j];
@@ -532,7 +539,9 @@ GEMMStore( const T* c_data, size_t c_step,
     }
     else
     {
-        for( j = 0; j <= d_size.width - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for( ; j <= d_size.width - 4; j += 4 )
         {
             WT t0 = alpha*d_buf[j];
             WT t1 = alpha*d_buf[j+1];
@@ -543,6 +552,7 @@ GEMMStore( const T* c_data, size_t c_step,
             d_data[j+2] = T(t0);
             d_data[j+3] = T(t1);
         }
+#endif
         for( ; j < d_size.width; j++ )
             d_data[j] = T(alpha*d_buf[j]);
     }
@@ -1987,6 +1997,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         float t0, t1;
@@ -1997,7 +2008,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2024,6 +2035,7 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         double t0, t1;
@@ -2034,7 +2046,7 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2198,9 +2210,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
     for( i = 0; i < len; i++, mat += matstep )
     {
         double row_sum = 0;
-        for( j = 0; j <= len - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= len - 4; j += 4 )
             row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                        diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+#endif
         for( ; j < len; j++ )
             row_sum += diff[j]*mat[j];
         result += row_sum * diff[i];
@@ -2226,9 +2241,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
     for( i = 0; i < len; i++, mat += matstep )
     {
         double row_sum = 0;
-        for( j = 0; j <= len - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= len - 4; j += 4 )
             row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                        diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+#endif
         for( ; j < len; j++ )
             row_sum += diff[j]*mat[j];
         result += row_sum * diff[i];
@@ -2574,9 +2592,11 @@ dotProd_(const T* src1, const T* src2, int len)
 {
     int i = 0;
     double result = 0;
+#if CV_ENABLE_UNROLLED
     for( ; i <= len - 4; i += 4 )
         result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
             (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
+#endif
     for( ; i < len; i++ )
         result += (double)src1[i]*src2[i];
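Worth noting in the GEMMSingleMul hunk: the unrolled inner product accumulates into four independent sums s0..s3 and folds them once per row, which breaks the loop-carried dependence on a single accumulator and lets consecutive multiply-adds overlap in the pipeline. The idea in isolation (dot4acc is illustrative, not from the patch):

    // Dot product with four independent accumulators: the partial sums
    // do not depend on one another, so the multiply-adds can execute in
    // parallel; they are folded into a single sum only at the end.
    static double dot4acc(const double* a, const double* b, int n)
    {
        double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
        int k = 0;
        for( ; k <= n - 4; k += 4 )
        {
            s0 += a[k]*b[k];
            s1 += a[k+1]*b[k+1];
            s2 += a[k+2]*b[k+2];
            s3 += a[k+3]*b[k+3];
        }
        double s = (s0 + s1) + (s2 + s3);
        for( ; k < n; k++ )   // scalar tail for the last 0..3 elements
            s += a[k]*b[k];
        return s;
    }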

@@ -1657,9 +1657,10 @@ namespace cv
 template<typename T> static void
 transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
 {
-    int i, j, m = sz.width, n = sz.height;
-    for( i = 0; i <= m - 4; i += 4 )
+    int i=0, j, m = sz.width, n = sz.height;
+#if CV_ENABLE_UNROLLED
+    for(; i <= m - 4; i += 4 )
     {
         T* d0 = (T*)(dst + dstep*i);
         T* d1 = (T*)(dst + dstep*(i+1));
@@ -1685,12 +1686,13 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
         }
     }
+#endif
     for( ; i < m; i++ )
     {
         T* d0 = (T*)(dst + dstep*i);
-        for( j = 0; j <= n - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
         {
             const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
             const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
@@ -1699,7 +1701,7 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
         }
-
+#endif
         for( ; j < n; j++ )
         {
             const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
@@ -1878,7 +1880,9 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
     for( ; --size.height; )
     {
         src += srcstep;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        i = 0;
+#if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
             WT s0, s1;
             s0 = op(buf[i], (WT)src[i]);
@@ -1889,7 +1893,7 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
             s1 = op(buf[i+3], (WT)src[i+3]);
             buf[i+2] = s0; buf[i+3] = s1;
         }
-
+#endif
         for( ; i < size.width; i++ )
             buf[i] = op(buf[i], (WT)src[i]);
     }
@@ -2467,7 +2471,9 @@ double cv::kmeans( InputArray _data, int K,
             sample = data.ptr<float>(i);
             k = labels[i];
             float* center = centers.ptr<float>(k);
-            for( j = 0; j <= dims - 4; j += 4 )
+            j=0;
+#if CV_ENABLE_UNROLLED
+            for(; j <= dims - 4; j += 4 )
             {
                 float t0 = center[j] + sample[j];
                 float t1 = center[j+1] + sample[j+1];
@@ -2481,6 +2487,7 @@ double cv::kmeans( InputArray _data, int K,
                 center[j+2] = t0;
                 center[j+3] = t1;
             }
+#endif
             for( ; j < dims; j++ )
                 center[j] += sample[j];
             counters[k]++;
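transpose_ above moves four destination rows per outer pass and, inside, four source rows per inner pass, i.e. 4x4 tiles, which keeps the writes unit-stride on the destination side. A compact index-based rendering of the same traversal (transposeTiled is illustrative; the original works on byte steps and raw pointers):

    #include <cstddef>

    // 4x4-tiled transpose in the spirit of transpose_: src is n rows by
    // m columns, dst is m rows by n columns, dst[i][j] = src[j][i].
    // The tail loop handles row counts that are not multiples of 4.
    template<typename T>
    static void transposeTiled(const T* src, int m, int n, T* dst)
    {
        int i = 0;
        for( ; i <= m - 4; i += 4 )           // four destination rows per pass
            for( int j = 0; j < n; j++ )
                for( int di = 0; di < 4; di++ )
                    dst[(std::size_t)(i + di)*n + j] = src[(std::size_t)j*m + i + di];
        for( ; i < m; i++ )                   // leftover destination rows
            for( int j = 0; j < n; j++ )
                dst[(std::size_t)i*n + j] = src[(std::size_t)j*m + i];
    }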

@@ -65,13 +65,16 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
     const T* src = src0;
     if( !mask )
     {
-        int i;
+        int i=0;
         int k = cn % 4;
         if( k == 1 )
         {
             ST s0 = dst[0];
-            for( i = 0; i <= len - 4; i += 4, src += cn*4 )
+#if CV_ENABLE_UNROLLED
+            for(; i <= len - 4; i += 4, src += cn*4 )
                 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
+#endif
             for( ; i < len; i++, src += cn )
                 s0 += src[0];
             dst[0] = s0;
@@ -151,6 +154,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
         if( mask[i] )
         {
             int k = 0;
+#if CV_ENABLE_UNROLLED
             for( ; k <= cn - 4; k += 4 )
             {
                 ST s0, s1;
@@ -161,6 +165,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
                 s1 = dst[k+3] + src[k+3];
                 dst[k+2] = s0; dst[k+3] = s1;
             }
+#endif
             for( ; k < cn; k++ )
                 dst[k] += src[k];
             nzm++;
@@ -205,9 +210,11 @@ static SumFunc sumTab[] =
 template<typename T>
 static int countNonZero_(const T* src, int len )
 {
-    int i, nz = 0;
-    for( i = 0; i <= len - 4; i += 4 )
+    int i=0, nz = 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= len - 4; i += 4 )
         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
+#endif
     for( ; i < len; i++ )
         nz += src[i] != 0;
     return nz;
@@ -826,14 +833,15 @@ float normL2Sqr_(const float* a, const float* b, int n)
     }
     else
 #endif
-    {
+    //vz why do we need unroll here? no sse = no need to unroll
+    {
         for( ; j <= n - 4; j += 4 )
         {
             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
         }
     }
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
@@ -866,6 +874,7 @@ float normL1_(const float* a, const float* b, int n)
     }
     else
 #endif
+    //vz no need to unroll here - if no sse
     {
         for( ; j <= n - 4; j += 4 )
         {
@@ -873,7 +882,7 @@ float normL1_(const float* a, const float* b, int n)
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -906,6 +915,7 @@ int normL1_(const uchar* a, const uchar* b, int n)
     }
     else
 #endif
+    //vz why do we need unroll here? no sse = no unroll
     {
         for( ; j <= n - 4; j += 4 )
         {
@@ -913,7 +923,6 @@ int normL1_(const uchar* a, const uchar* b, int n)
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
-
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -997,9 +1006,11 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
     else
         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
     int i = 0, result = 0;
+#if CV_ENABLE_UNROLLED
     for( ; i <= n - 4; i += 4 )
         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
+#endif
     for( ; i < n; i++ )
         result += tab[a[i] ^ b[i]];
     return result;
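In normHamming, tab is a 256-entry popcount table and tab[a[i] ^ b[i]] counts the differing bits of one byte pair. The table and the distance in miniature (hypothetical helper names, not OpenCV API):

    #include <cassert>

    // 256-entry byte popcount table: popTab[v] is the number of set
    // bits in v, so popTab[a ^ b] counts differing bits per byte.
    static int popTab[256];

    static void initPopTab()
    {
        for( int v = 0; v < 256; v++ )
        {
            int c = 0;
            for( int t = v; t != 0; t >>= 1 )
                c += t & 1;
            popTab[v] = c;
        }
    }

    static int hamming(const unsigned char* a, const unsigned char* b, int n)
    {
        int result = 0;
        for( int i = 0; i < n; i++ )
            result += popTab[a[i] ^ b[i]];
        return result;
    }

    int main()
    {
        initPopTab();
        unsigned char x[] = {0x00, 0xFF, 0x0F};
        unsigned char y[] = {0x01, 0x00, 0x0F};
        assert(hamming(x, y, 3) == 1 + 8 + 0);
        return 0;
    }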
