Merge pull request #15555 from ChipKerchner:flipVectorize

* Vectorize flipHoriz and flipVert functions.

* Change v_load_mirror_1 to use vec_revb for VSX

* Only use vec_revb in ISA3.0

* Removing vec_revb code since some of the older compilers don't fully support it.

* Use new v_reverse intrinsic and cleanup code.

* Ensure there are no alignment issues with copies
pull/15838/head
Chip Kerchner 5 years ago committed by Alexander Alekhin
parent cec7cc037b
commit ed7e4273cd
  1. 213
      modules/core/src/copy.cpp

@@ -563,25 +563,206 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
return *this;
}
#if CV_SIMD128
// Mirrors every row of a byte image left-to-right, for elements that are exactly
// one lane of the vector type V wide.
//   src, sstep - first source row and the byte distance between source rows
//   dst, dstep - first destination row and the byte distance between destination rows
//   size       - width in elements, height in rows
//   esz        - element size in bytes (the callers in this file pass sizeof(V::lane_type))
// Both ends of a row are always read before either is written, so the statement
// order inside each loop body must be kept.
template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
    typedef typename V::lane_type T;

    const int vecBytes = v_uint8x16::nlanes;          // bytes handled per vector step
    const int laneBytes = (int)sizeof(T);             // bytes handled per scalar step
    const int rowBytes = (int)(size.width*esz);
    const int halfBytes = (rowBytes + 1)/2;           // the front cursor stops here
    const int vecHalfBytes = halfBytes & -vecBytes;   // halfBytes rounded down to whole vectors

    while( size.height-- )
    {
        int lo = 0;          // offset of the next unprocessed byte at the front
        int hi = rowBytes;   // one past the last unprocessed byte at the back

        // Vector part: load a block from each end, reorder its lanes with
        // v_reverse (presumably a lane-order reversal - confirm against the
        // intrinsics docs) and store each block at the opposite end.
        for( ; lo < vecHalfBytes; lo += vecBytes, hi -= vecBytes )
        {
            V front = v_reverse(v_load(reinterpret_cast<const T*>(src + lo)));
            V back = v_reverse(v_load(reinterpret_cast<const T*>(src + hi - vecBytes)));
            v_store(reinterpret_cast<T*>(dst + hi - vecBytes), front);
            v_store(reinterpret_cast<T*>(dst + lo), back);
        }

        // Scalar tail: whole-element accesses only when both row pointers are
        // multiples of sizeof(T); otherwise move the element byte by byte.
        const bool aligned = (((size_t)src | (size_t)dst) % sizeof(T)) == 0;
        if( aligned )
        {
            for( ; lo < halfBytes; lo += laneBytes, hi -= laneBytes )
            {
                const T front = *reinterpret_cast<const T*>(src + lo);
                const T back = *reinterpret_cast<const T*>(src + hi - laneBytes);
                *reinterpret_cast<T*>(dst + hi - laneBytes) = front;
                *reinterpret_cast<T*>(dst + lo) = back;
            }
        }
        else
        {
            for( ; lo < halfBytes; lo += laneBytes, hi -= laneBytes )
            {
                const uchar* srcFront = src + lo;
                const uchar* srcBack = src + hi - laneBytes;
                uchar* dstFront = dst + lo;
                uchar* dstBack = dst + hi - laneBytes;

                for( int b = 0; b < laneBytes; b++ )
                {
                    const uchar front = srcFront[b];
                    const uchar back = srcBack[b];
                    dstBack[b] = front;
                    dstFront[b] = back;
                }
            }
        }

        src += sstep;
        dst += dstep;
    }
}
// Mirrors every row of a byte image left-to-right, for elements that are moved as
// a T1 value immediately followed by a T2 value (sizeof(T1) + sizeof(T2) bytes per step).
//   src, sstep - first source row and the byte distance between source rows
//   dst, dstep - first destination row and the byte distance between destination rows
//   size       - width in elements, height in rows
//   esz        - element size in bytes (the callers in this file pass sizeof(T1) + sizeof(T2))
// NOTE(review): the T1/T2 accesses are not alignment-checked here, unlike the
// scalar tail of flipHoriz_single - confirm this is acceptable on all targets.
template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
    const int pairBytes = (int)(sizeof(T1) + sizeof(T2));   // bytes swapped per step
    const int rowBytes = (int)(size.width*esz);
    const int halfBytes = (rowBytes + 1)/2;                 // the front cursor stops here

    while( size.height-- )
    {
        int lo = 0;          // offset of the front element
        int hi = rowBytes;   // one past the end of the back element

        for( ; lo < halfBytes; lo += pairBytes, hi -= pairBytes )
        {
            const uchar* srcFront = src + lo;
            const uchar* srcBack = src + hi - pairBytes;
            uchar* dstFront = dst + lo;
            uchar* dstBack = dst + hi - pairBytes;

            // Read both elements completely before writing either one.
            const T1 frontHead = *reinterpret_cast<const T1*>(srcFront);
            const T2 frontTail = *reinterpret_cast<const T2*>(srcFront + sizeof(T1));
            const T1 backHead = *reinterpret_cast<const T1*>(srcBack);
            const T2 backTail = *reinterpret_cast<const T2*>(srcBack + sizeof(T1));

            *reinterpret_cast<T1*>(dstBack) = frontHead;
            *reinterpret_cast<T2*>(dstBack + sizeof(T1)) = frontTail;
            *reinterpret_cast<T1*>(dstFront) = backHead;
            *reinterpret_cast<T2*>(dstFront + sizeof(T1)) = backTail;
        }

        src += sstep;
        dst += dstep;
    }
}
#endif
// Mirrors every row of a byte image left-to-right, keeping the bytes inside each
// element in their original order.
//   src, sstep - first source row and the byte distance between source rows
//   dst, dstep - first destination row and the byte distance between destination rows
//   size       - width in elements, height in rows
//   esz        - element size in bytes
// With CV_SIMD enabled, element sizes 32, 16, 8, 4, 2, 1, 24, 12, 6 and 3 bytes are
// dispatched to dedicated paths; every other size (and every size without CV_SIMD)
// uses the table-driven byte swap at the end.
// Each path reads both ends of a row before writing either of them.
static void
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
#if CV_SIMD
    if (esz == 2 * v_uint8x16::nlanes)
    {
        // One element is two 128-bit registers (or one 256-bit register): swap
        // whole elements between the two ends of the row.
        int end = (int)(size.width*esz);
        int width = end/2;

        for( ; size.height--; src += sstep, dst += dstep )
        {
            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
            {
#if CV_SIMD256
                v_uint8x32 t0, t1;

                t0 = v256_load((uchar*)src + i);
                t1 = v256_load((uchar*)src + j);
                v_store(dst + j, t0);
                v_store(dst + i, t1);
#else
                v_uint8x16 t0, t1, t2, t3;

                t0 = v_load((uchar*)src + i);
                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
                t2 = v_load((uchar*)src + j);
                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
                v_store(dst + j, t0);
                v_store(dst + j + v_uint8x16::nlanes, t1);
                v_store(dst + i, t2);
                v_store(dst + i + v_uint8x16::nlanes, t3);
#endif
            }
        }
    }
    else if (esz == v_uint8x16::nlanes)
    {
        // One element is exactly one 128-bit register: swap registers as they are.
        int end = (int)(size.width*esz);
        int width = end/2;

        for( ; size.height--; src += sstep, dst += dstep )
        {
            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
            {
                v_uint8x16 t0, t1;

                t0 = v_load((uchar*)src + i);
                t1 = v_load((uchar*)src + j);
                v_store(dst + j, t0);
                v_store(dst + i, t1);
            }
        }
    }
    else if (esz == 8)
    {
        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 4)
    {
        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 2)
    {
        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 1)
    {
        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 24)
    {
        // One element is a 128-bit register followed by a 64-bit word.
        int end = (int)(size.width*esz);
        int width = (end + 1)/2;

        for( ; size.height--; src += sstep, dst += dstep )
        {
            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
            {
                v_uint8x16 t0, t1;
                uint64_t t2, t3;

                t0 = v_load((uchar*)src + i);
                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
                v_store(dst + i, t1);
                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
            }
        }
    }
    else if (esz == 12)
    {
        flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 6)
    {
        flipHoriz_double<uint,ushort>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 3)
    {
        flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
    }
    else
#endif
    {
        // Generic path: tab[b] is the byte offset that byte b of a row maps to
        // after mirroring (same position inside the element, mirrored element index).
        int i, j, limit = (int)(((size.width + 1)/2)*esz);
        AutoBuffer<int> _tab(size.width*esz);
        int* tab = _tab.data();

        for( i = 0; i < size.width; i++ )
            for( size_t k = 0; k < esz; k++ )
                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);

        for( ; size.height--; src += sstep, dst += dstep )
        {
            // Only the first half of the row is walked; each step swaps a byte
            // with its mirrored counterpart.
            for( i = 0; i < limit; i++ )
            {
                j = tab[i];
                uchar t0 = src[i], t1 = src[j];
                dst[i] = t1; dst[j] = t0;
            }
        }
    }
}
@@ -597,6 +778,16 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
dst0 += dstep, dst1 -= dstep )
{
int i = 0;
#if CV_SIMD
for( ; i <= size.width - (v_int32::nlanes * 4); i += v_int32::nlanes * 4 )
{
v_int32 t0 = vx_load((int*)(src0 + i));
v_int32 t1 = vx_load((int*)(src1 + i));
vx_store((int*)(dst0 + i), t1);
vx_store((int*)(dst1 + i), t0);
}
#endif
if( ((size_t)src0|(size_t)dst0|(size_t)src1|(size_t)dst1) % sizeof(int) == 0 )
{
for( ; i <= size.width - 16; i += 16 )

Loading…
Cancel
Save