@ -563,25 +563,206 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
return * this ;
}
# if CV_SIMD128
// Mirrors every row of an image left-to-right when one pixel is exactly one
// SIMD lane of vector type V (lane type T = V::lane_type).
//
//   src, sstep - first source row and the byte distance between source rows
//   dst, dstep - first destination row and the byte distance between its rows
//   size       - size.width pixels per row, size.height rows
//   esz        - bytes per pixel; the caller is expected to pass sizeof(T)
//
// Each row is walked from both ends towards the middle. Both ends are always
// read before either end is written.
template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
    typedef typename V::lane_type T;

    const int vecBytes  = v_uint8x16::nlanes;        // bytes moved per vector step
    const int laneBytes = (int)sizeof(T);            // bytes moved per scalar step
    const int rowBytes  = (int)(size.width*esz);
    const int halfBytes = (rowBytes + 1)/2;          // the head cursor stops here
    const int simdBytes = halfBytes & -vecBytes;     // part of the half covered by whole vectors

    while( size.height-- )
    {
        int lo = 0;           // byte offset of the head cursor
        int hi = rowBytes;    // byte offset one past the tail cursor

        // Vector part: swap one vector from each end, reversing the lane order
        // so that the pixels inside each vector are mirrored as well.
        for( ; lo < simdBytes; lo += vecBytes, hi -= vecBytes )
        {
            const V head = v_reverse(v_load((const T*)(src + lo)));
            const V tail = v_reverse(v_load((const T*)(src + hi - vecBytes)));

            v_store((T*)(dst + hi - vecBytes), head);
            v_store((T*)(dst + lo), tail);
        }

        // Scalar remainder: whole-lane accesses are used only when both row
        // pointers are multiples of sizeof(T); otherwise move byte by byte.
        const bool misaligned = (((size_t)src | (size_t)dst) % sizeof(T)) != 0;
        if( misaligned )
        {
            for( ; lo < halfBytes; lo += laneBytes, hi -= laneBytes )
            {
                for( int b = 0; b < laneBytes; b++ )
                {
                    const uchar head = src[lo + b];
                    const uchar tail = src[hi + b - laneBytes];

                    dst[hi + b - laneBytes] = head;
                    dst[lo + b] = tail;
                }
            }
        }
        else
        {
            for( ; lo < halfBytes; lo += laneBytes, hi -= laneBytes )
            {
                const T head = *(const T*)(src + lo);
                const T tail = *(const T*)(src + hi - laneBytes);

                *(T*)(dst + hi - laneBytes) = head;
                *(T*)(dst + lo) = tail;
            }
        }

        src += sstep;
        dst += dstep;
    }
}
// Mirrors every row of an image left-to-right when one pixel is moved as two
// scalar pieces: a T1 followed directly by a T2 (sizeof(T1) + sizeof(T2) bytes
// per step; the caller is expected to pass that sum as esz).
//
//   src, sstep - first source row and the byte distance between source rows
//   dst, dstep - first destination row and the byte distance between its rows
//   size       - size.width pixels per row, size.height rows
//   esz        - bytes per pixel
//
// Both pixels of a pair are read completely before either one is written.
template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
    const int pairBytes = (int)(sizeof(T1) + sizeof(T2));   // bytes moved per step
    const int rowBytes  = (int)(size.width*esz);
    const int halfBytes = (rowBytes + 1)/2;                 // the head cursor stops here

    while( size.height-- )
    {
        int lo = 0;           // byte offset of the head pixel
        int hi = rowBytes;    // byte offset one past the tail pixel

        while( lo < halfBytes )
        {
            const uchar* srcHead = src + lo;
            const uchar* srcTail = src + hi - pairBytes;
            uchar* dstHead = dst + lo;
            uchar* dstTail = dst + hi - pairBytes;

            // Read both pixels first ...
            const T1 headA = *(const T1*)srcHead;
            const T2 headB = *(const T2*)(srcHead + sizeof(T1));
            const T1 tailA = *(const T1*)srcTail;
            const T2 tailB = *(const T2*)(srcTail + sizeof(T1));

            // ... then write them to the opposite ends.
            *(T1*)dstTail = headA;
            *(T2*)(dstTail + sizeof(T1)) = headB;
            *(T1*)dstHead = tailA;
            *(T2*)(dstHead + sizeof(T1)) = tailB;

            lo += pairBytes;
            hi -= pairBytes;
        }

        src += sstep;
        dst += dstep;
    }
}
# endif
// Mirrors every row of an image left-to-right (pixel order is reversed, the
// bytes inside each pixel keep their order).
//
//   src, sstep - first source row and the byte distance between source rows
//   dst, dstep - first destination row and the byte distance between its rows
//   size       - size.width pixels per row, size.height rows
//   esz        - bytes per pixel
//
// With CV_SIMD enabled the pixel size selects a specialised path:
//   2*16 / 16 bytes - one pixel is moved as two / one 16-byte vectors (or one
//                     32-byte vector under CV_SIMD256), no lane reversal needed
//   8 / 4 / 2 / 1   - flipHoriz_single: vectors of pixels with reversed lanes
//   24              - a 16-byte vector plus a 64-bit scalar per pixel
//   12 / 6 / 3      - flipHoriz_double: two scalars per pixel
// Any other pixel size (and every size without CV_SIMD) uses the generic
// table-driven byte swap at the end.
//
// In every path both ends of a row are read before either end is written.
//
// NOTE(review): the flipHoriz_single/flipHoriz_double helpers are declared
// under CV_SIMD128 while this dispatch is gated by CV_SIMD - confirm the two
// macros are always enabled together.
static void
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
#if CV_SIMD
    if (esz == 2 * v_uint8x16::nlanes)
    {
        int end = (int)(size.width*esz);
        int width = end/2;

        for( ; size.height--; src += sstep, dst += dstep )
        {
            // i walks pixels from the row start, j from the row end
            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
            {
#if CV_SIMD256
                v_uint8x32 t0, t1;

                t0 = v256_load((uchar*)src + i);
                t1 = v256_load((uchar*)src + j);
                v_store(dst + j, t0);
                v_store(dst + i, t1);
#else
                v_uint8x16 t0, t1, t2, t3;

                t0 = v_load((uchar*)src + i);
                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
                t2 = v_load((uchar*)src + j);
                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
                v_store(dst + j, t0);
                v_store(dst + j + v_uint8x16::nlanes, t1);
                v_store(dst + i, t2);
                v_store(dst + i + v_uint8x16::nlanes, t3);
#endif
            }
        }
    }
    else if (esz == v_uint8x16::nlanes)
    {
        int end = (int)(size.width*esz);
        int width = end/2;

        for( ; size.height--; src += sstep, dst += dstep )
        {
            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
            {
                v_uint8x16 t0, t1;

                t0 = v_load((uchar*)src + i);
                t1 = v_load((uchar*)src + j);
                v_store(dst + j, t0);
                v_store(dst + i, t1);
            }
        }
    }
    else if (esz == 8)
    {
        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 4)
    {
        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 2)
    {
        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 1)
    {
        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 24)
    {
        int end = (int)(size.width*esz);
        int width = (end + 1)/2;

        for( ; size.height--; src += sstep, dst += dstep )
        {
            // here j is one past the tail pixel, so the tail starts at j - 24
            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
            {
                v_uint8x16 t0, t1;
                uint64_t t2, t3;

                t0 = v_load((uchar*)src + i);
                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
                v_store(dst + i, t1);
                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
            }
        }
    }
    else if (esz == 12)
    {
        flipHoriz_double<uint64_t, uint>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 6)
    {
        flipHoriz_double<uint, ushort>(src, sstep, dst, dstep, size, esz);
    }
    else if (esz == 3)
    {
        flipHoriz_double<ushort, uchar>(src, sstep, dst, dstep, size, esz);
    }
    else
#endif // CV_SIMD
    {
        // Generic path: tab[b] is the byte offset that byte b of the row is
        // exchanged with (same position inside the mirrored pixel).
        int i, j, limit = (int)(((size.width + 1)/2)*esz);
        AutoBuffer<int> _tab(size.width*esz);
        int* tab = _tab.data();

        for( i = 0; i < size.width; i++ )
            for( size_t k = 0; k < esz; k++ )
                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);

        for( ; size.height--; src += sstep, dst += dstep )
        {
            // only the first half (rounded up to whole pixels) needs visiting
            for( i = 0; i < limit; i++ )
            {
                j = tab[i];
                uchar t0 = src[i], t1 = src[j];
                dst[i] = t1; dst[j] = t0;
            }
        }
    }
}
@ -597,6 +778,16 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
dst0 + = dstep , dst1 - = dstep )
{
int i = 0 ;
# if CV_SIMD
for ( ; i < = size . width - ( v_int32 : : nlanes * 4 ) ; i + = v_int32 : : nlanes * 4 )
{
v_int32 t0 = vx_load ( ( int * ) ( src0 + i ) ) ;
v_int32 t1 = vx_load ( ( int * ) ( src1 + i ) ) ;
vx_store ( ( int * ) ( dst0 + i ) , t1 ) ;
vx_store ( ( int * ) ( dst1 + i ) , t0 ) ;
}
# endif
if ( ( ( size_t ) src0 | ( size_t ) dst0 | ( size_t ) src1 | ( size_t ) dst1 ) % sizeof ( int ) = = 0 )
{
for ( ; i < = size . width - 16 ; i + = 16 )