From fbdb93ec790a5ef140c9371a8a51278c12649dda Mon Sep 17 00:00:00 2001
From: Victoria Zhislina
Date: Fri, 10 Feb 2012 06:05:04 +0000
Subject: [PATCH] CV_ENABLE_UNROLLED

---
 .../core/include/opencv2/core/internal.hpp    |  6 ++
 .../core/include/opencv2/core/operations.hpp  | 31 ++++++----
 modules/core/src/arithm.cpp                   | 45 ++++++++------
 modules/core/src/convert.cpp                  | 59 +++++++++++++++----
 modules/core/src/copy.cpp                     |  2 +
 modules/core/src/lapack.cpp                   |  6 +-
 modules/core/src/matmul.cpp                   | 48 ++++++++++-----
 modules/core/src/matrix.cpp                   | 27 +++++----
 modules/core/src/stat.cpp                     | 27 ++++++---
 9 files changed, 178 insertions(+), 73 deletions(-)

diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp
index b0a68db4ec..e9af13ce64 100644
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -131,6 +131,12 @@ CV_INLINE IppiSize ippiSize(int width, int height)
 #define CPU_HAS_NEON_FEATURE (false)
 #endif
 
+#ifdef CV_ICC
+#define CV_ENABLE_UNROLLED 0
+#else
+#define CV_ENABLE_UNROLLED 1
+#endif
+
 #ifndef IPPI_CALL
 #define IPPI_CALL(func) CV_Assert((func) >= 0)
 #endif
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index 4e71816027..5da73f02ec 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -48,6 +48,7 @@
 #include
 #endif // SKIP_INCLUDES
+
 #ifdef __cplusplus
 
 /////// exchange-add operation for atomic operations on reference counters ///////
@@ -903,12 +904,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i=0;
+ #if CV_ENABLE_UNROLLED
+    for( ; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i], v1 = a[i+1], v2 = a[i+2], v3 = a[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i];
         s += v*v;
     }
@@ -922,12 +925,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i = 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         s += (_AccTp)fast_abs(a[i]) + (_AccTp)fast_abs(a[i+1]) +
             (_AccTp)fast_abs(a[i+2]) + (_AccTp)fast_abs(a[i+3]);
     }
+#endif
     for( ; i < n; i++ )
         s += fast_abs(a[i]);
     return s;
@@ -948,12 +953,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+ #if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
         s += v*v;
     }
@@ -986,12 +993,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+ #if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += std::abs(v0) + std::abs(v1) + std::abs(v2) + std::abs(v3);
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
@@ -2422,14 +2431,16 @@ template<typename _Tp> inline typename DataType<_Tp>::work_type
 dot(const Vector<_Tp>& v1, const Vector<_Tp>& v2)
 {
     typedef typename DataType<_Tp>::work_type _Tw;
-    size_t i, n = v1.size();
+    size_t i = 0, n = v1.size();
     assert(v1.size() == v2.size());
 
     _Tw s = 0;
     const _Tp *ptr1 = &v1[0], *ptr2 = &v2[0];
-    for( i = 0; i <= n - 4; i += 4 )
+ #if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
         s += (_Tw)ptr1[i]*ptr2[i] + (_Tw)ptr1[i+1]*ptr2[i+1] +
             (_Tw)ptr1[i+2]*ptr2[i+2] + (_Tw)ptr1[i+3]*ptr2[i+3];
+#endif
     for( ; i < n; i++ )
         s += (_Tw)ptr1[i]*ptr2[i];
     return s;
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index e1879580e3..9c46cfa23a 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -99,7 +99,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         T v0 = op(src1[x], src2[x]);
         T v1 = op(src1[x+1], src2[x+1]);
         dst[x] = v0; dst[x+1] = v1;
         v0 = op(src1[x+2], src2[x+2]);
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -208,7 +208,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         int v0 = op(src1[x], src2[x]);
         int v1 = op(src1[x+1], src2[x+1]);
         dst[x] = v0; dst[x+1] = v1;
         v0 = op(src1[x+2], src2[x+2]);
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -265,6 +265,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
         }
     }
 #endif
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         float v0 = op(src1[x], src2[x]);
         float v1 = op(src1[x+1], src2[x+1]);
         dst[x] = v0; dst[x+1] = v1;
         v0 = op(src1[x+2], src2[x+2]);
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -1508,8 +1509,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
     {
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
-            int i;
-            for( i = 0; i <= size.width - 4; i += 4 )
+            int i=0;
+            #if CV_ENABLE_UNROLLED
+            for(; i <= size.width - 4; i += 4 )
             {
                 T t0;
                 T t1;
                 t0 = saturate_cast<T>(src1[i] * src2[i]);
                 t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
                 dst[i] = t0; dst[i+1] = t1;
                 t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
                 t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
                 dst[i+2] = t0; dst[i+3] = t1;
             }
-
+            #endif
             for( ; i < size.width; i++ )
                 dst[i] = saturate_cast<T>(src1[i] * src2[i]);
         }
@@ -1532,8 +1534,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
     {
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
-            int i;
-            for( i = 0; i <= size.width - 4; i += 4 )
+            int i = 0;
+            #if CV_ENABLE_UNROLLED
+            for(; i <= size.width - 4; i += 4 )
             {
                 T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                 T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                 dst[i] = t0; dst[i+1] = t1;
                 t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                 t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                 dst[i+2] = t0; dst[i+3] = t1;
             }
-
+            #endif
             for( ; i < size.width; i++ )
                 dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
         }
@@ -1561,6 +1564,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int i = 0;
+        #if CV_ENABLE_UNROLLED
        for( ; i <= size.width - 4; i += 4 )
        {
            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1590,7 +1594,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
                dst[i+2] = z2; dst[i+3] = z3;
            }
        }
-
+        #endif
        for( ; i < size.width; i++ )
            dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
    }
@@ -1606,6 +1610,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
     for( ; size.height--; src2 += step2, dst += step )
     {
         int i = 0;
+        #if CV_ENABLE_UNROLLED
        for( ; i <= size.width - 4; i += 4 )
        {
            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1635,7 +1640,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
                dst[i+2] = z2; dst[i+3] = z3;
            }
        }
-
+        #endif
        for( ; i < size.width; i++ )
            dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
    }
@@ -1834,6 +1839,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
             T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
             dst[x] = t0; dst[x+1] = t1;
@@ -1844,7 +1850,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
             dst[x+2] = t0; dst[x+3] = t1;
         }
-
+        #endif
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
     }
@@ -1891,6 +1897,7 @@ addWeighted8u( const uchar* src1, size_t step1,
             }
         }
 #endif
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             float t0, t1;
             t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
             t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
 
             dst[x] = saturate_cast<uchar>(t0);
             dst[x+1] = saturate_cast<uchar>(t1);
 
             t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
             t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
 
             dst[x+2] = saturate_cast<uchar>(t0);
             dst[x+3] = saturate_cast<uchar>(t1);
         }
+        #endif
 
         for( ; x < size.width; x++ )
         {
@@ -1994,6 +2002,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
             int x = 0;
+            #if CV_ENABLE_UNROLLED
             for( ; x <= size.width - 4; x += 4 )
             {
                 int t0, t1;
                 t0 = -(src1[x] > src2[x]) ^ m;
                 t1 = -(src1[x+1] > src2[x+1]) ^ m;
                 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
@@ -2004,7 +2013,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
                 t1 = -(src1[x+3] > src2[x+3]) ^ m;
                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
             }
-
+            #endif
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
@@ -2015,6 +2024,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
         {
             int x = 0;
+            #if CV_ENABLE_UNROLLED
             for( ; x <= size.width - 4; x += 4 )
             {
                 int t0, t1;
                 t0 = -(src1[x] == src2[x]) ^ m;
                 t1 = -(src1[x+1] == src2[x+1]) ^ m;
                 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
@@ -2025,7 +2035,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
                 t1 = -(src1[x+3] == src2[x+3]) ^ m;
                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
             }
-
+            #endif
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
         }
@@ -2382,6 +2392,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
     {
         int x = 0;
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             int t0, t1;
             t0 = src2[x] <= src1[x] && src1[x] <= src3[x];
             t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1];
             dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1;
@@ -2392,7 +2403,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
             dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
         }
-
+        #endif
         for( ; x < size.width; x++ )
             dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
     }
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index aaf0f08094..945b2fe255 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -578,7 +578,8 @@ cvtScaleAbs_( const T* src, size_t sstep,
             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
     }
 }
-
+
+
 template<typename T, typename DT, typename WT> static void
 cvtScale_( const T* src, size_t sstep,
            DT* dst, size_t dstep, Size size,
@@ -590,6 +591,7 @@ cvtScale_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             DT t0, t1;
             t0 = saturate_cast<DT>(src[x]*scale + shift);
             t1 = saturate_cast<DT>(src[x+1]*scale + shift);
             dst[x] = t0; dst[x+1] = t1;
             t0 = saturate_cast<DT>(src[x+2]*scale + shift);
             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
             dst[x+2] = t0; dst[x+3] = t1;
         }
+#endif
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
@@ -655,22 +658,54 @@ cvt_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
-        for( ; x <= size.width - 4; x += 4 )
-        {
-            DT t0, t1;
-            t0 = saturate_cast<DT>(src[x]);
-            t1 = saturate_cast<DT>(src[x+1]);
-            dst[x] = t0; dst[x+1] = t1;
-            t0 = saturate_cast<DT>(src[x+2]);
-            t1 = saturate_cast<DT>(src[x+3]);
-            dst[x+2] = t0; dst[x+3] = t1;
-        }
-
+        for( ; x <= size.width - 4; x += 4 )
+        {
+            DT t0, t1;
+            t0 = saturate_cast<DT>(src[x]);
+            t1 = saturate_cast<DT>(src[x+1]);
+            dst[x] = t0; dst[x+1] = t1;
+            t0 = saturate_cast<DT>(src[x+2]);
+            t1 = saturate_cast<DT>(src[x+3]);
+            dst[x+2] = t0; dst[x+3] = t1;
+        }
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]);
     }
 }
 
+//vz optimized template specialization, test Core_ConvertScale/ElemWiseTest
+template<> static void
+cvt_( const float* src, size_t sstep,
+      short* dst, size_t dstep, Size size )
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        #if CV_SSE2
+        if(USE_SSE2){
+            for( ; x <= size.width - 8; x += 8 )
+            {
+                __m128 src128 = _mm_loadu_ps (src + x);
+                __m128i src_int128 = _mm_cvtps_epi32 (src128);
+
+                src128 = _mm_loadu_ps (src + x + 4);
+                __m128i src1_int128 = _mm_cvtps_epi32 (src128);
+
+                src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
+                _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
+            }
+        }
+        #endif
+        for( ; x < size.width; x++ )
+            dst[x] = (src[x]);
+    }
+
+}
+
+
 template<typename T> static void
 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
 {
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 43f601c367..c0d0501550 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -59,6 +59,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
         const T* src = (const T*)_src;
         T* dst = (T*)_dst;
         int x = 0;
+        #if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             if( mask[x] )
                 dst[x] = src[x];
             if( mask[x+1] )
                 dst[x+1] = src[x+1];
             if( mask[x+2] )
                 dst[x+2] = src[x+2];
             if( mask[x+3] )
                 dst[x+3] = src[x+3];
         }
+        #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
                 dst[x] = src[x];
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 95c75292e0..4122b82265 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -741,7 +741,9 @@ MatrAXPY( int m, int n, const T1* x, int dx,
     for( i = 0; i < m; i++, x += dx, y += dy )
     {
         T2 s = a[i*inca];
-        for( j = 0; j <= n - 4; j += 4 )
+        j=0;
+         #if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
         {
             T3 t0 = (T3)(y[j] + s*x[j]);
             T3 t1 = (T3)(y[j+1] + s*x[j+1]);
             y[j] = t0;
             y[j+1] = t1;
             t0 = (T3)(y[j+2] + s*x[j+2]);
             t1 = (T3)(y[j+3] + s*x[j+3]);
             y[j+2] = t0;
             y[j+3] = t1;
         }
-
+        #endif
         for( ; j < n; j++ )
             y[j] = (T3)(y[j] + s*x[j]);
     }
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 409bbd45fa..fc655b6dc0 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -63,7 +63,9 @@ GEMM_CopyBlock( const uchar* src, size_t src_step,
     for( ; size.height--; src += src_step, dst += dst_step )
     {
-        for( j = 0; j <= size.width - 4; j += 4 )
+        j=0;
+         #if CV_ENABLE_UNROLLED
+        for( ; j <= size.width - 4; j += 4 )
         {
             int t0 = ((const int*)src)[j];
             int t1 = ((const int*)src)[j+1];
             ((int*)dst)[j] = t0;
             ((int*)dst)[j+1] = t1;
             t0 = ((const int*)src)[j+2];
             t1 = ((const int*)src)[j+3];
             ((int*)dst)[j+2] = t0;
             ((int*)dst)[j+3] = t1;
         }
-
+        #endif
         for( ; j < size.width; j++ )
             ((int*)dst)[j] = ((const int*)src)[j];
     }
@@ -237,15 +239,16 @@ GEMMSingleMul( const T* a_data, size_t a_step,
              c_data += c_step1 )
         {
             WT s0(0), s1(0), s2(0), s3(0);
-
-            for( k = 0; k <= n - 4; k += 4 )
+            k = 0;
+             #if CV_ENABLE_UNROLLED
+            for( ; k <= n - 4; k += 4 )
             {
                 s0 += WT(a_data[k])*WT(b_data[k]);
                 s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                 s2 += WT(a_data[k+2])*WT(b_data[k+2]);
                 s3 += WT(a_data[k+3])*WT(b_data[k+3]);
             }
-
+            #endif
             for( ; k < n; k++ )
                 s0 += WT(a_data[k])*WT(b_data[k]);
             s0 = (s0+s1+s2+s3)*alpha;
@@ -342,8 +345,9 @@ GEMMSingleMul( const T* a_data, size_t a_step,
             for( k = 0; k < n; k++, b_data += b_step )
             {
                 WT al(a_data[k]);
-
-                for( j = 0; j <= m - 4; j += 4 )
+                j=0;
+                 #if CV_ENABLE_UNROLLED
+                for(; j <= m - 4; j += 4 )
                 {
                     WT t0 = d_buf[j] + WT(b_data[j])*al;
                     WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
@@ -354,7 +358,7 @@ GEMMSingleMul( const T* a_data, size_t a_step,
                     d_buf[j+2] = t0;
                     d_buf[j+3] = t1;
                 }
-
+                #endif
                 for( ; j < m; j++ )
                     d_buf[j] += WT(b_data[j])*al;
             }
@@ -509,7 +513,9 @@ GEMMStore( const T* c_data, size_t c_step,
     if( _c_data )
     {
         c_data = _c_data;
-        for( j = 0; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
+        j=0;
+         #if CV_ENABLE_UNROLLED
+        for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
         {
             WT t0 = alpha*d_buf[j];
             WT t1 = alpha*d_buf[j+1];
@@ -524,6 +530,7 @@ GEMMStore( const T* c_data, size_t c_step,
             d_data[j+2] = T(t0);
             d_data[j+3] = T(t1);
         }
+        #endif
         for( ; j < d_size.width; j++, c_data += c_step1 )
         {
             WT t0 = alpha*d_buf[j];
@@ -532,7 +539,9 @@ GEMMStore( const T* c_data, size_t c_step,
     }
     else
     {
-        for( j = 0; j <= d_size.width - 4; j += 4 )
+        j = 0;
+        #if CV_ENABLE_UNROLLED
+        for( ; j <= d_size.width - 4; j += 4 )
         {
             WT t0 = alpha*d_buf[j];
             WT t1 = alpha*d_buf[j+1];
@@ -543,6 +552,7 @@ GEMMStore( const T* c_data, size_t c_step,
             d_data[j+2] = T(t0);
             d_data[j+3] = T(t1);
         }
+        #endif
         for( ; j < d_size.width; j++ )
             d_data[j] = T(alpha*d_buf[j]);
     }
@@ -1987,6 +1997,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         float t0, t1;
         t0 = src1[i]*alpha + src2[i];
         t1 = src1[i+1]*alpha + src2[i+1];
         dst[i] = t0; dst[i+1] = t1;
         t0 = src1[i+2]*alpha + src2[i+2];
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2024,6 +2035,7 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         double t0, t1;
         t0 = src1[i]*alpha + src2[i];
         t1 = src1[i+1]*alpha + src2[i+1];
         dst[i] = t0; dst[i+1] = t1;
         t0 = src1[i+2]*alpha + src2[i+2];
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2198,9 +2210,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
         for( i = 0; i < len; i++, mat += matstep )
         {
             double row_sum = 0;
-            for( j = 0; j <= len - 4; j += 4 )
+            j = 0;
+             #if CV_ENABLE_UNROLLED
+            for(; j <= len - 4; j += 4 )
                 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                            diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+            #endif
             for( ; j < len; j++ )
                 row_sum += diff[j]*mat[j];
             result += row_sum * diff[i];
@@ -2226,9 +2241,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
         for( i = 0; i < len; i++, mat += matstep )
         {
             double row_sum = 0;
-            for( j = 0; j <= len - 4; j += 4 )
+            j = 0;
+             #if CV_ENABLE_UNROLLED
+            for(; j <= len - 4; j += 4 )
                 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                            diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+            #endif
             for( ; j < len; j++ )
                 row_sum += diff[j]*mat[j];
             result += row_sum * diff[i];
@@ -2574,9 +2592,11 @@ dotProd_(const T* src1, const T* src2, int len)
 {
     int i = 0;
     double result = 0;
+     #if CV_ENABLE_UNROLLED
     for( ; i <= len - 4; i += 4 )
         result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
             (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
+    #endif
     for( ; i < len; i++ )
         result += (double)src1[i]*src2[i];
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 6c53e6429d..cf1bbfe9a2 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1657,9 +1657,10 @@ namespace cv
 template<typename T> static void
 transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
 {
-    int i, j, m = sz.width, n = sz.height;
-
-    for( i = 0; i <= m - 4; i += 4 )
+    int i=0, j, m = sz.width, n = sz.height;
+
+    #if CV_ENABLE_UNROLLED
+    for(; i <= m - 4; i += 4 )
     {
         T* d0 = (T*)(dst + dstep*i);
         T* d1 = (T*)(dst + dstep*(i+1));
@@ -1685,12 +1686,13 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
         }
     }
-
+    #endif
     for( ; i < m; i++ )
     {
         T* d0 = (T*)(dst + dstep*i);
-
-        for( j = 0; j <= n - 4; j += 4 )
+        j = 0;
+        #if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
         {
             const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
             const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
@@ -1699,7 +1701,7 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
         }
-
+        #endif
         for( ; j < n; j++ )
         {
             const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
@@ -1878,7 +1880,9 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
     for( ; --size.height; )
     {
         src += srcstep;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        i = 0;
+        #if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
             WT s0, s1;
             s0 = op(buf[i], (WT)src[i]);
             s1 = op(buf[i+1], (WT)src[i+1]);
             buf[i] = s0; buf[i+1] = s1;
@@ -1889,7 +1893,7 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
             s1 = op(buf[i+3], (WT)src[i+3]);
             buf[i+2] = s0; buf[i+3] = s1;
         }
-
+        #endif
         for( ; i < size.width; i++ )
             buf[i] = op(buf[i], (WT)src[i]);
     }
@@ -2467,7 +2471,9 @@ double cv::kmeans( InputArray _data, int K,
             sample = data.ptr<float>(i);
             k = labels[i];
             float* center = centers.ptr<float>(k);
-            for( j = 0; j <= dims - 4; j += 4 )
+            j=0;
+            #if CV_ENABLE_UNROLLED
+            for(; j <= dims - 4; j += 4 )
             {
                 float t0 = center[j] + sample[j];
                 float t1 = center[j+1] + sample[j+1];
@@ -2481,6 +2487,7 @@ double cv::kmeans( InputArray _data, int K,
                 center[j+2] = t0;
                 center[j+3] = t1;
             }
+            #endif
             for( ; j < dims; j++ )
                 center[j] += sample[j];
             counters[k]++;
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 48f8e5aba9..3d6348ba8e 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -65,13 +65,16 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
     const T* src = src0;
     if( !mask )
     {
-        int i;
+        int i=0;
         int k = cn % 4;
         if( k == 1 )
         {
             ST s0 = dst[0];
-            for( i = 0; i <= len - 4; i += 4, src += cn*4 )
+
+            #if CV_ENABLE_UNROLLED
+            for(; i <= len - 4; i += 4, src += cn*4 )
                 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
+            #endif
             for( ; i < len; i++, src += cn )
                 s0 += src[0];
             dst[0] = s0;
@@ -151,6 +154,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
         if( mask[i] )
         {
             int k = 0;
+            #if CV_ENABLE_UNROLLED
             for( ; k <= cn - 4; k += 4 )
             {
                 ST s0, s1;
                 s0 = dst[k] + src[k];
                 s1 = dst[k+1] + src[k+1];
                 dst[k] = s0; dst[k+1] = s1;
@@ -161,6 +165,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
                 s1 = dst[k+3] + src[k+3];
                 dst[k+2] = s0; dst[k+3] = s1;
             }
+            #endif
             for( ; k < cn; k++ )
                 dst[k] += src[k];
             nzm++;
@@ -205,9 +210,11 @@ static SumFunc sumTab[] =
 template<typename T>
 static int countNonZero_(const T* src, int len )
 {
-    int i, nz = 0;
-    for( i = 0; i <= len - 4; i += 4 )
+    int i=0, nz = 0;
+    #if CV_ENABLE_UNROLLED
+    for(; i <= len - 4; i += 4 )
         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
+    #endif
     for( ; i < len; i++ )
         nz += src[i] != 0;
     return nz;
 }
@@ -826,14 +833,15 @@ float normL2Sqr_(const float* a, const float* b, int n)
     }
     else
 #endif
-    {
+    //vz why do we need unroll here? no sse = no need to unroll
+    {
         for( ; j <= n - 4; j += 4 )
         {
             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
         }
     }
-
+
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
         d += t*t;
     }
@@ -866,6 +874,7 @@ float normL1_(const float* a, const float* b, int n)
     }
     else
 #endif
+    //vz no need to unroll here - if no sse
     {
         for( ; j <= n - 4; j += 4 )
         {
             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
-
+
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -906,6 +915,7 @@ int normL1_(const uchar* a, const uchar* b, int n)
     }
     else
 #endif
+    //vz why do we need unroll here? no sse = no unroll
     {
         for( ; j <= n - 4; j += 4 )
         {
             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
-
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -997,9 +1006,11 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
     else
         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
     int i = 0, result = 0;
+    #if CV_ENABLE_UNROLLED
     for( ; i <= n - 4; i += 4 )
         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
+    #endif
     for( ; i < n; i++ )
         result += tab[a[i] ^ b[i]];
     return result;
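
---
Note (not part of the commit): the pattern this patch applies throughout modules/core can be
reduced to the minimal sketch below. It is an illustrative example under the assumption that
CV_ENABLE_UNROLLED follows the internal.hpp default above (0 under the Intel compiler's CV_ICC,
1 otherwise); dot_sketch and the test values are invented names, not OpenCV API.

    #include <cstdio>

    #ifndef CV_ENABLE_UNROLLED
    #define CV_ENABLE_UNROLLED 1   // assumption: mirrors the internal.hpp default
    #endif

    static float dot_sketch(const float* a, const float* b, int n)
    {
        float s = 0;
        int i = 0;
    #if CV_ENABLE_UNROLLED
        // Manual 4x unroll, kept for compilers that benefit from it.
        for( ; i <= n - 4; i += 4 )
            s += a[i]*b[i] + a[i+1]*b[i+1] + a[i+2]*b[i+2] + a[i+3]*b[i+3];
    #endif
        // Tail loop: handles the n % 4 remainder, or the whole range
        // when the unrolled block is compiled out.
        for( ; i < n; i++ )
            s += a[i]*b[i];
        return s;
    }

    int main()
    {
        float a[] = {1, 2, 3, 4, 5}, b[] = {5, 4, 3, 2, 1};
        std::printf("%g\n", dot_sketch(a, b, 5)); // prints 35
        return 0;
    }

Because the tail loop alone already computes the full result, the guarded block can be dropped
without changing behavior — which is why the patch can simply compile it out for CV_ICC, whose
auto-vectorizer handles the plain loop better than the hand-unrolled one.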