CV_ENABLE_UNROLLED

pull/13383/head
Victoria Zhislina 13 years ago
parent 68a54e7f3c
commit fbdb93ec79
9 changed files:

  1. modules/core/include/opencv2/core/internal.hpp (6 changed lines)
  2. modules/core/include/opencv2/core/operations.hpp (31 changed lines)
  3. modules/core/src/arithm.cpp (45 changed lines)
  4. modules/core/src/convert.cpp (59 changed lines)
  5. modules/core/src/copy.cpp (2 changed lines)
  6. modules/core/src/lapack.cpp (6 changed lines)
  7. modules/core/src/matmul.cpp (48 changed lines)
  8. modules/core/src/matrix.cpp (27 changed lines)
  9. modules/core/src/stat.cpp (27 changed lines)

@@ -131,6 +131,12 @@ CV_INLINE IppiSize ippiSize(int width, int height)
 #define CPU_HAS_NEON_FEATURE (false)
 #endif
 
+#ifdef CV_ICC
+#define CV_ENABLE_UNROLLED 0
+#else
+#define CV_ENABLE_UNROLLED 1
+#endif
+
 #ifndef IPPI_CALL
 #define IPPI_CALL(func) CV_Assert((func) >= 0)
 #endif
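This macro is the switch every hunk below keys on: under the Intel compiler (CV_ICC) the hand-unrolled loop bodies are compiled out, on the assumption that ICC vectorizes the plain loops better on its own. A minimal standalone sketch of the pattern the rest of the commit applies (sumArray is illustrative, not from the patch):

    #include <cstddef>

    #ifndef CV_ENABLE_UNROLLED
    #define CV_ENABLE_UNROLLED 1
    #endif

    // A 4x manually unrolled accumulation, compiled out when
    // CV_ENABLE_UNROLLED is 0; the scalar tail loop handles the
    // remaining 0..3 elements in either configuration.
    static double sumArray(const float* a, int n)
    {
        double s = 0;
        int i = 0;
    #if CV_ENABLE_UNROLLED
        for( ; i <= n - 4; i += 4 )
            s += (double)a[i] + a[i+1] + a[i+2] + a[i+3];
    #endif
        for( ; i < n; i++ )
            s += a[i];
        return s;
    }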

@@ -48,6 +48,7 @@
 #include <limits.h>
 #endif // SKIP_INCLUDES
 
 #ifdef __cplusplus
+
 /////// exchange-add operation for atomic operations on reference counters ///////
@@ -903,12 +904,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i=0;
+#if CV_ENABLE_UNROLLED
+    for( ; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i], v1 = a[i+1], v2 = a[i+2], v3 = a[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i];
@@ -922,12 +925,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i = 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         s += (_AccTp)fast_abs(a[i]) + (_AccTp)fast_abs(a[i+1]) +
             (_AccTp)fast_abs(a[i+2]) + (_AccTp)fast_abs(a[i+3]);
     }
+#endif
     for( ; i < n; i++ )
         s += fast_abs(a[i]);
     return s;
@@ -948,12 +953,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
@@ -986,12 +993,14 @@ template<typename _Tp, typename _AccTp> static inline
 _AccTp normL1(const _Tp* a, const _Tp* b, int n)
 {
     _AccTp s = 0;
-    int i;
-    for( i = 0; i <= n - 4; i += 4 )
+    int i= 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
     {
         _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
         s += std::abs(v0) + std::abs(v1) + std::abs(v2) + std::abs(v3);
     }
+#endif
     for( ; i < n; i++ )
     {
         _AccTp v = a[i] - b[i];
@@ -2422,14 +2431,16 @@ template<typename _Tp> inline typename DataType<_Tp>::work_type
 dot(const Vector<_Tp>& v1, const Vector<_Tp>& v2)
 {
     typedef typename DataType<_Tp>::work_type _Tw;
-    size_t i, n = v1.size();
+    size_t i = 0, n = v1.size();
     assert(v1.size() == v2.size());
     _Tw s = 0;
     const _Tp *ptr1 = &v1[0], *ptr2 = &v2[0];
+#if CV_ENABLE_UNROLLED
-    for( i = 0; i <= n - 4; i += 4 )
+    for(; i <= n - 4; i += 4 )
         s += (_Tw)ptr1[i]*ptr2[i] + (_Tw)ptr1[i+1]*ptr2[i+1] +
             (_Tw)ptr1[i+2]*ptr2[i+2] + (_Tw)ptr1[i+3]*ptr2[i+3];
+#endif
     for( ; i < n; i++ )
         s += (_Tw)ptr1[i]*ptr2[i];
     return s;
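One caveat in the dot() hunk above: i and n are size_t, so when n < 4 the bound n - 4 wraps to a huge unsigned value and the unrolled loop would read past both buffers; the int-indexed norm kernels earlier in the file do not have this problem. A hedged sketch of an overflow-safe formulation (dotSafe is a hypothetical helper, not proposed in the patch):

    #include <cstddef>

    // Unsigned-safe bound: test i + 4 <= n instead of i <= n - 4,
    // so the unrolled loop is simply skipped when n < 4.
    static double dotSafe(const float* a, const float* b, std::size_t n)
    {
        double s = 0;
        std::size_t i = 0;
        for( ; i + 4 <= n; i += 4 )
            s += (double)a[i]*b[i] + (double)a[i+1]*b[i+1] +
                 (double)a[i+2]*b[i+2] + (double)a[i+3]*b[i+3];
        for( ; i < n; i++ )
            s += (double)a[i]*b[i];
        return s;
    }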

@@ -99,7 +99,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         T v0 = op(src1[x], src2[x]);
@@ -109,7 +109,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -208,7 +208,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
         }
     }
 #endif
-
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         int v0 = op(src1[x], src2[x]);
@@ -218,7 +218,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -265,6 +265,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
         }
     }
 #endif
+#if CV_ENABLE_UNROLLED
     for( ; x <= sz.width - 4; x += 4 )
     {
         float v0 = op(src1[x], src2[x]);
@@ -274,7 +275,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
         v1 = op(src1[x+3], src2[x+3]);
         dst[x+2] = v0; dst[x+3] = v1;
     }
-
+#endif
     for( ; x < sz.width; x++ )
         dst[x] = op(src1[x], src2[x]);
 }
@@ -1508,8 +1509,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
 {
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
-        int i;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        int i=0;
+#if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
            T t0;
            T t1;
@@ -1523,7 +1525,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
            dst[i+2] = t0;
            dst[i+3] = t1;
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = saturate_cast<T>(src1[i] * src2[i]);
     }
@@ -1532,8 +1534,9 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
 {
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
-        int i;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        int i = 0;
+#if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
             T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
             T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
@@ -1543,7 +1546,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
             dst[i+2] = t0; dst[i+3] = t1;
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
     }
@@ -1561,6 +1564,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int i = 0;
+#if CV_ENABLE_UNROLLED
         for( ; i <= size.width - 4; i += 4 )
         {
             if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1590,7 +1594,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
     }
@@ -1606,6 +1610,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
     for( ; size.height--; src2 += step2, dst += step )
     {
         int i = 0;
+#if CV_ENABLE_UNROLLED
         for( ; i <= size.width - 4; i += 4 )
         {
             if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
@@ -1635,7 +1640,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-
+#endif
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
     }
@@ -1834,6 +1839,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
@@ -1844,7 +1850,7 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
             dst[x+2] = t0; dst[x+3] = t1;
         }
-
+#endif
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
     }
@@ -1891,6 +1897,7 @@ addWeighted8u( const uchar* src1, size_t step1,
             }
         }
 #endif
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             float t0, t1;
@@ -1906,6 +1913,7 @@ addWeighted8u( const uchar* src1, size_t step1,
             dst[x+2] = saturate_cast<uchar>(t0);
             dst[x+3] = saturate_cast<uchar>(t1);
         }
+#endif
 
         for( ; x < size.width; x++ )
         {
@@ -1994,6 +2002,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             int t0, t1;
@@ -2004,7 +2013,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = -(src1[x+3] > src2[x+3]) ^ m;
             dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
         }
-
+#endif
         for( ; x < size.width; x++ )
             dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
     }
@@ -2015,6 +2024,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             int t0, t1;
@@ -2025,7 +2035,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = -(src1[x+3] == src2[x+3]) ^ m;
             dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
         }
-
+#endif
         for( ; x < size.width; x++ )
             dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
     }
@@ -2382,6 +2392,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            int t0, t1;
@@ -2392,7 +2403,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
            t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
            dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
        }
-
+#endif
        for( ; x < size.width; x++ )
            dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
    }
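The cmp_ and inRange_ hunks all lean on the same branchless trick: a comparison yields 0 or 1, unary minus turns that into 0 or all-ones in two's complement, XOR with m (0 or 255) optionally inverts the predicate, and the uchar cast keeps the low byte as 0x00 or 0xFF. A self-contained illustration (cmpMask is an illustrative helper, not OpenCV API):

    #include <cassert>

    // Branchless compare-to-mask: -(a > b) is 0 or ~0, XOR with m
    // (0 or 255) flips the low byte for the inverted predicate, and
    // the cast to unsigned char keeps 0x00 or 0xFF.
    static unsigned char cmpMask(int a, int b, int m)
    {
        return (unsigned char)(-(a > b) ^ m);
    }

    int main()
    {
        assert(cmpMask(5, 3, 0)   == 0xFF);  // a > b, predicate kept
        assert(cmpMask(3, 5, 0)   == 0x00);
        assert(cmpMask(5, 3, 255) == 0x00);  // m = 255 inverts: a <= b
        assert(cmpMask(3, 5, 255) == 0xFF);
        return 0;
    }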

@@ -578,7 +578,8 @@ cvtScaleAbs_( const T* src, size_t sstep,
             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
     }
 }
 
+
 template<typename T, typename DT, typename WT> static void
 cvtScale_( const T* src, size_t sstep,
            DT* dst, size_t dstep, Size size,
@@ -590,6 +591,7 @@ cvtScale_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             DT t0, t1;
@@ -600,6 +602,7 @@ cvtScale_( const T* src, size_t sstep,
             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
             dst[x+2] = t0; dst[x+3] = t1;
         }
+#endif
 
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
@@ -655,22 +658,54 @@ cvt_( const T* src, size_t sstep,
     for( ; size.height--; src += sstep, dst += dstep )
     {
         int x = 0;
-        for( ; x <= size.width - 4; x += 4 )
-        {
-            DT t0, t1;
-            t0 = saturate_cast<DT>(src[x]);
-            t1 = saturate_cast<DT>(src[x+1]);
-            dst[x] = t0; dst[x+1] = t1;
-            t0 = saturate_cast<DT>(src[x+2]);
-            t1 = saturate_cast<DT>(src[x+3]);
-            dst[x+2] = t0; dst[x+3] = t1;
-        }
+        for( ; x <= size.width - 4; x += 4 )
+        {
+            DT t0, t1;
+            t0 = saturate_cast<DT>(src[x]);
+            t1 = saturate_cast<DT>(src[x+1]);
+            dst[x] = t0; dst[x+1] = t1;
+            t0 = saturate_cast<DT>(src[x+2]);
+            t1 = saturate_cast<DT>(src[x+3]);
+            dst[x+2] = t0; dst[x+3] = t1;
+        }
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<DT>(src[x]);
     }
 }
 
+//vz optimized template specialization, test Core_ConvertScale/ElemWiseTest
+template<> static void
+cvt_<float, short>( const float* src, size_t sstep,
+                    short* dst, size_t dstep, Size size )
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+#if CV_SSE2
+        if(USE_SSE2){
+            for( ; x <= size.width - 8; x += 8 )
+            {
+                __m128 src128 = _mm_loadu_ps (src + x);
+                __m128i src_int128 = _mm_cvtps_epi32 (src128);
+
+                src128 = _mm_loadu_ps (src + x + 4);
+                __m128i src1_int128 = _mm_cvtps_epi32 (src128);
+
+                src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
+                _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
+            }
+        }
+#endif
+        for( ; x < size.width; x++ )
+            dst[x] = (src[x]);
+    }
+}
+
 template<typename T> static void
 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
 {
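A note on the cvt_<float, short> specialization above: _mm_cvtps_epi32 converts using the current rounding mode (round-to-nearest-even by default) and _mm_packs_epi32 narrows the two int32 vectors to int16 with signed saturation, so the vector path behaves like saturate_cast<short>. The scalar tail, dst[x] = (src[x]), truncates toward zero instead, which is presumably why the comment points at the Core_ConvertScale/ElemWiseTest tests. A standalone sketch of the same kernel, assuming an SSE2 target:

    #include <emmintrin.h>  // SSE2 intrinsics

    // Convert n floats to shorts: round-to-nearest plus signed saturation
    // in the SSE2 body (8 elements per iteration), plain cast in the tail.
    static void cvtFloatToShort(const float* src, short* dst, int n)
    {
        int x = 0;
        for( ; x <= n - 8; x += 8 )
        {
            __m128i lo = _mm_cvtps_epi32(_mm_loadu_ps(src + x));     // 4 floats -> int32, rounded
            __m128i hi = _mm_cvtps_epi32(_mm_loadu_ps(src + x + 4));
            _mm_storeu_si128((__m128i*)(dst + x),
                             _mm_packs_epi32(lo, hi));               // saturating pack to int16
        }
        for( ; x < n; x++ )
            dst[x] = (short)src[x];  // note: truncates, unlike the packed path
    }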

@@ -59,6 +59,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
         const T* src = (const T*)_src;
         T* dst = (T*)_dst;
         int x = 0;
+#if CV_ENABLE_UNROLLED
         for( ; x <= size.width - 4; x += 4 )
         {
             if( mask[x] )
@@ -70,6 +71,7 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
             if( mask[x+3] )
                 dst[x+3] = src[x+3];
         }
+#endif
         for( ; x < size.width; x++ )
             if( mask[x] )
                 dst[x] = src[x];
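copyMask_ gets the same gating, though the unrolled body keeps a branch per element, so here unrolling only amortizes loop overhead rather than removing branches. The kernel in isolation (copyMasked is an illustrative rewrite for one element type):

    #ifndef CV_ENABLE_UNROLLED
    #define CV_ENABLE_UNROLLED 1
    #endif

    // Masked copy in the spirit of copyMask_: each element is copied only
    // where the mask byte is non-zero; the per-element tests remain.
    static void copyMasked(const int* src, const unsigned char* mask, int* dst, int n)
    {
        int x = 0;
    #if CV_ENABLE_UNROLLED
        for( ; x <= n - 4; x += 4 )
        {
            if( mask[x] )   dst[x]   = src[x];
            if( mask[x+1] ) dst[x+1] = src[x+1];
            if( mask[x+2] ) dst[x+2] = src[x+2];
            if( mask[x+3] ) dst[x+3] = src[x+3];
        }
    #endif
        for( ; x < n; x++ )
            if( mask[x] )
                dst[x] = src[x];
    }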

@@ -741,7 +741,9 @@ MatrAXPY( int m, int n, const T1* x, int dx,
     for( i = 0; i < m; i++, x += dx, y += dy )
     {
         T2 s = a[i*inca];
-        for( j = 0; j <= n - 4; j += 4 )
+        j=0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
         {
             T3 t0 = (T3)(y[j] + s*x[j]);
             T3 t1 = (T3)(y[j+1] + s*x[j+1]);
@@ -752,7 +754,7 @@ MatrAXPY( int m, int n, const T1* x, int dx,
             y[j+2] = t0;
             y[j+3] = t1;
         }
-
+#endif
         for( ; j < n; j++ )
             y[j] = (T3)(y[j] + s*x[j]);
     }
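For reference, MatrAXPY is a row-wise AXPY update, y_row += a[i] * x_row, the building block of the decomposition routines in this file. Stripped of the three template element types and the unrolling, the kernel reduces to (a sketch in plain double, strides in elements):

    // Row-wise AXPY as computed by MatrAXPY: for each of the m rows,
    // add a[i*inca] times the i-th row of x into the i-th row of y.
    static void matrAXPY(int m, int n, const double* x, int dx,
                         const double* a, int inca, double* y, int dy)
    {
        for( int i = 0; i < m; i++, x += dx, y += dy )
        {
            double s = a[i*inca];
            for( int j = 0; j < n; j++ )
                y[j] += s * x[j];
        }
    }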

@@ -63,7 +63,9 @@ GEMM_CopyBlock( const uchar* src, size_t src_step,
     for( ; size.height--; src += src_step, dst += dst_step )
     {
-        for( j = 0; j <= size.width - 4; j += 4 )
+        j=0;
+#if CV_ENABLE_UNROLLED
+        for( ; j <= size.width - 4; j += 4 )
         {
             int t0 = ((const int*)src)[j];
             int t1 = ((const int*)src)[j+1];
@@ -74,7 +76,7 @@ GEMM_CopyBlock( const uchar* src, size_t src_step,
             ((int*)dst)[j+2] = t0;
             ((int*)dst)[j+3] = t1;
         }
-
+#endif
         for( ; j < size.width; j++ )
             ((int*)dst)[j] = ((const int*)src)[j];
     }
@@ -237,15 +239,16 @@ GEMMSingleMul( const T* a_data, size_t a_step,
              c_data += c_step1 )
         {
             WT s0(0), s1(0), s2(0), s3(0);
-            for( k = 0; k <= n - 4; k += 4 )
+            k = 0;
+#if CV_ENABLE_UNROLLED
+            for( ; k <= n - 4; k += 4 )
             {
                 s0 += WT(a_data[k])*WT(b_data[k]);
                 s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                 s2 += WT(a_data[k+2])*WT(b_data[k+2]);
                 s3 += WT(a_data[k+3])*WT(b_data[k+3]);
             }
+#endif
             for( ; k < n; k++ )
                 s0 += WT(a_data[k])*WT(b_data[k]);
             s0 = (s0+s1+s2+s3)*alpha;
@@ -342,8 +345,9 @@ GEMMSingleMul( const T* a_data, size_t a_step,
         for( k = 0; k < n; k++, b_data += b_step )
         {
             WT al(a_data[k]);
-            for( j = 0; j <= m - 4; j += 4 )
+            j=0;
+#if CV_ENABLE_UNROLLED
+            for(; j <= m - 4; j += 4 )
             {
                 WT t0 = d_buf[j] + WT(b_data[j])*al;
                 WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
@@ -354,7 +358,7 @@ GEMMSingleMul( const T* a_data, size_t a_step,
                 d_buf[j+2] = t0;
                 d_buf[j+3] = t1;
             }
-
+#endif
             for( ; j < m; j++ )
                 d_buf[j] += WT(b_data[j])*al;
         }
@@ -509,7 +513,9 @@ GEMMStore( const T* c_data, size_t c_step,
     if( _c_data )
     {
         c_data = _c_data;
-        for( j = 0; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
+        j=0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
         {
             WT t0 = alpha*d_buf[j];
             WT t1 = alpha*d_buf[j+1];
@@ -524,6 +530,7 @@ GEMMStore( const T* c_data, size_t c_step,
             d_data[j+2] = T(t0);
             d_data[j+3] = T(t1);
         }
+#endif
         for( ; j < d_size.width; j++, c_data += c_step1 )
         {
             WT t0 = alpha*d_buf[j];
@@ -532,7 +539,9 @@ GEMMStore( const T* c_data, size_t c_step,
     }
     else
     {
-        for( j = 0; j <= d_size.width - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for( ; j <= d_size.width - 4; j += 4 )
         {
             WT t0 = alpha*d_buf[j];
             WT t1 = alpha*d_buf[j+1];
@@ -543,6 +552,7 @@ GEMMStore( const T* c_data, size_t c_step,
             d_data[j+2] = T(t0);
             d_data[j+3] = T(t1);
         }
+#endif
         for( ; j < d_size.width; j++ )
             d_data[j] = T(alpha*d_buf[j]);
     }
@@ -1987,6 +1997,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         float t0, t1;
@@ -1997,7 +2008,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2024,6 +2035,7 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
     }
     else
 #endif
+    //vz why do we need unroll here?
     for( ; i <= len - 4; i += 4 )
     {
         double t0, t1;
@@ -2034,7 +2046,7 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
         t1 = src1[i+3]*alpha + src2[i+3];
         dst[i+2] = t0; dst[i+3] = t1;
     }
-    for( ; i < len; i++ )
+    for(; i < len; i++ )
         dst[i] = src1[i]*alpha + src2[i];
 }
@@ -2198,9 +2210,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
     for( i = 0; i < len; i++, mat += matstep )
     {
         double row_sum = 0;
-        for( j = 0; j <= len - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= len - 4; j += 4 )
             row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                        diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+#endif
         for( ; j < len; j++ )
             row_sum += diff[j]*mat[j];
         result += row_sum * diff[i];
@@ -2226,9 +2241,12 @@ double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
     for( i = 0; i < len; i++, mat += matstep )
     {
         double row_sum = 0;
-        for( j = 0; j <= len - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= len - 4; j += 4 )
             row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                        diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
+#endif
         for( ; j < len; j++ )
             row_sum += diff[j]*mat[j];
         result += row_sum * diff[i];
@@ -2574,9 +2592,11 @@ dotProd_(const T* src1, const T* src2, int len)
 {
     int i = 0;
     double result = 0;
+#if CV_ENABLE_UNROLLED
     for( ; i <= len - 4; i += 4 )
         result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
             (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
+#endif
     for( ; i < len; i++ )
         result += (double)src1[i]*src2[i];
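Worth noting in the GEMMSingleMul hunk: the unrolled inner product accumulates into four independent sums s0..s3 and folds them once per row, which breaks the loop-carried dependence on a single accumulator and lets consecutive multiply-adds overlap in the pipeline. The idea in isolation (dot4acc is illustrative, not from the patch):

    // Dot product with four independent accumulators: the partial sums
    // do not depend on one another, so the multiply-adds can execute in
    // parallel; they are folded into a single sum only at the end.
    static double dot4acc(const double* a, const double* b, int n)
    {
        double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
        int k = 0;
        for( ; k <= n - 4; k += 4 )
        {
            s0 += a[k]*b[k];
            s1 += a[k+1]*b[k+1];
            s2 += a[k+2]*b[k+2];
            s3 += a[k+3]*b[k+3];
        }
        double s = (s0 + s1) + (s2 + s3);
        for( ; k < n; k++ )   // scalar tail for the last 0..3 elements
            s += a[k]*b[k];
        return s;
    }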

@@ -1657,9 +1657,10 @@ namespace cv
 template<typename T> static void
 transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
 {
-    int i, j, m = sz.width, n = sz.height;
-    for( i = 0; i <= m - 4; i += 4 )
+    int i=0, j, m = sz.width, n = sz.height;
+#if CV_ENABLE_UNROLLED
+    for(; i <= m - 4; i += 4 )
     {
         T* d0 = (T*)(dst + dstep*i);
         T* d1 = (T*)(dst + dstep*(i+1));
@@ -1685,12 +1686,13 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
         }
     }
+#endif
     for( ; i < m; i++ )
     {
         T* d0 = (T*)(dst + dstep*i);
-        for( j = 0; j <= n - 4; j += 4 )
+        j = 0;
+#if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )
         {
             const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
             const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
@@ -1699,7 +1701,7 @@ transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
             d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
         }
-
+#endif
         for( ; j < n; j++ )
         {
             const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
@@ -1878,7 +1880,9 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
     for( ; --size.height; )
     {
         src += srcstep;
-        for( i = 0; i <= size.width - 4; i += 4 )
+        i = 0;
+#if CV_ENABLE_UNROLLED
+        for(; i <= size.width - 4; i += 4 )
         {
             WT s0, s1;
             s0 = op(buf[i], (WT)src[i]);
@@ -1889,7 +1893,7 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
             s1 = op(buf[i+3], (WT)src[i+3]);
             buf[i+2] = s0; buf[i+3] = s1;
         }
-
+#endif
         for( ; i < size.width; i++ )
             buf[i] = op(buf[i], (WT)src[i]);
     }
@@ -2467,7 +2471,9 @@ double cv::kmeans( InputArray _data, int K,
             sample = data.ptr<float>(i);
             k = labels[i];
             float* center = centers.ptr<float>(k);
-            for( j = 0; j <= dims - 4; j += 4 )
+            j=0;
+#if CV_ENABLE_UNROLLED
+            for(; j <= dims - 4; j += 4 )
             {
                 float t0 = center[j] + sample[j];
                 float t1 = center[j+1] + sample[j+1];
@@ -2481,6 +2487,7 @@ double cv::kmeans( InputArray _data, int K,
                 center[j+2] = t0;
                 center[j+3] = t1;
             }
+#endif
             for( ; j < dims; j++ )
                 center[j] += sample[j];
             counters[k]++;
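transpose_ above moves four destination rows per outer pass and, inside, four source rows per inner pass, i.e. 4x4 tiles, which keeps the writes unit-stride on the destination side. A compact index-based rendering of the same traversal (transposeTiled is illustrative; the original works on byte steps and raw pointers):

    #include <cstddef>

    // 4x4-tiled transpose in the spirit of transpose_: src is n rows by
    // m columns, dst is m rows by n columns, dst[i][j] = src[j][i].
    // The tail loop handles row counts that are not multiples of 4.
    template<typename T>
    static void transposeTiled(const T* src, int m, int n, T* dst)
    {
        int i = 0;
        for( ; i <= m - 4; i += 4 )           // four destination rows per pass
            for( int j = 0; j < n; j++ )
                for( int di = 0; di < 4; di++ )
                    dst[(std::size_t)(i + di)*n + j] = src[(std::size_t)j*m + i + di];
        for( ; i < m; i++ )                   // leftover destination rows
            for( int j = 0; j < n; j++ )
                dst[(std::size_t)i*n + j] = src[(std::size_t)j*m + i];
    }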

@@ -65,13 +65,16 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
     const T* src = src0;
     if( !mask )
     {
-        int i;
+        int i=0;
         int k = cn % 4;
         if( k == 1 )
         {
             ST s0 = dst[0];
-            for( i = 0; i <= len - 4; i += 4, src += cn*4 )
+#if CV_ENABLE_UNROLLED
+            for(; i <= len - 4; i += 4, src += cn*4 )
                 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
+#endif
             for( ; i < len; i++, src += cn )
                 s0 += src[0];
             dst[0] = s0;
@@ -151,6 +154,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
         if( mask[i] )
         {
             int k = 0;
+#if CV_ENABLE_UNROLLED
             for( ; k <= cn - 4; k += 4 )
             {
                 ST s0, s1;
@@ -161,6 +165,7 @@ static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
                 s1 = dst[k+3] + src[k+3];
                 dst[k+2] = s0; dst[k+3] = s1;
             }
+#endif
             for( ; k < cn; k++ )
                 dst[k] += src[k];
             nzm++;
@@ -205,9 +210,11 @@ static SumFunc sumTab[] =
 template<typename T>
 static int countNonZero_(const T* src, int len )
 {
-    int i, nz = 0;
-    for( i = 0; i <= len - 4; i += 4 )
+    int i=0, nz = 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= len - 4; i += 4 )
         nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);
+#endif
     for( ; i < len; i++ )
         nz += src[i] != 0;
     return nz;
@@ -826,14 +833,15 @@ float normL2Sqr_(const float* a, const float* b, int n)
     }
     else
 #endif
-    {
+    //vz why do we need unroll here? no sse = no need to unroll
+    {
         for( ; j <= n - 4; j += 4 )
         {
             float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
             d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
         }
     }
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
@@ -866,6 +874,7 @@ float normL1_(const float* a, const float* b, int n)
     }
     else
 #endif
+    //vz no need to unroll here - if no sse
     {
         for( ; j <= n - 4; j += 4 )
         {
@@ -873,7 +882,7 @@ float normL1_(const float* a, const float* b, int n)
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -906,6 +915,7 @@ int normL1_(const uchar* a, const uchar* b, int n)
     }
     else
 #endif
+    //vz why do we need unroll here? no sse = no unroll
     {
         for( ; j <= n - 4; j += 4 )
         {
@@ -913,7 +923,6 @@ int normL1_(const uchar* a, const uchar* b, int n)
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
         }
     }
-
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
@@ -997,9 +1006,11 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
     else
         CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
     int i = 0, result = 0;
+#if CV_ENABLE_UNROLLED
     for( ; i <= n - 4; i += 4 )
         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
+#endif
     for( ; i < n; i++ )
         result += tab[a[i] ^ b[i]];
     return result;
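In normHamming, tab is a 256-entry popcount table and tab[a[i] ^ b[i]] counts the differing bits of one byte pair. The table and the distance in miniature (hypothetical helper names, not OpenCV API):

    #include <cassert>

    // 256-entry byte popcount table: popTab[v] is the number of set
    // bits in v, so popTab[a ^ b] counts differing bits per byte.
    static int popTab[256];

    static void initPopTab()
    {
        for( int v = 0; v < 256; v++ )
        {
            int c = 0;
            for( int t = v; t != 0; t >>= 1 )
                c += t & 1;
            popTab[v] = c;
        }
    }

    static int hamming(const unsigned char* a, const unsigned char* b, int n)
    {
        int result = 0;
        for( int i = 0; i < n; i++ )
            result += popTab[a[i] ^ b[i]];
        return result;
    }

    int main()
    {
        initPopTab();
        unsigned char x[] = {0x00, 0xFF, 0x0F};
        unsigned char y[] = {0x01, 0x00, 0x0F};
        assert(hamming(x, y, 3) == 1 + 8 + 0);
        return 0;
    }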
