@@ -1451,115 +1451,82 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
}
}
#if CV_SIMD128 && !defined(__aarch64__)
static inline void
load3x3Matrix( const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3 )
{
    m0 = v_float32x4(m[0], m[4], m[8], 0);
    m1 = v_float32x4(m[1], m[5], m[9], 0);
    m2 = v_float32x4(m[2], m[6], m[10], 0);
    m3 = v_float32x4(m[3], m[7], m[11], 0);
}
#endif
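// load3x3Matrix() above stores the 3x4 affine matrix column-wise, so that
// v_matmuladd(x, m0, m1, m2, m3) == m0*x[0] + m1*x[1] + m2*x[2] + m3
// yields the transformed pixel (B, G, R, 0) for an input x = (b, g, r, *).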
#if CV_SIMD128
static inline v_int16x8
v_matmulvec(const v_int16x8 &v0, const v_int16x8 &m0, const v_int16x8 &m1, const v_int16x8 &m2, const v_int32x4 &m3, const int BITS)
{
    // v0 : 0 b0 g0 r0 b1 g1 r1 ?
    v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1
    v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1
    v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1
    v_int32x4 t3 = v_setzero_s32();
    v_int32x4 s0, s1, s2, s3;
    v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3);
    s0 = s0 + s1 + m3; // B0 G0 R0 ?
    s2 = s2 + s3 + m3; // B1 G1 R1 ?
    s0 = s0 >> BITS;
    s2 = s2 >> BITS;
    v_int16x8 result = v_pack(s0, v_setzero_s32()); // B0 G0 R0 0 0 0 0 0
    result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16); // 0 B0 G0 R0 0 0 0 0
    result = result | v_pack(v_setzero_s32(), s2); // 0 B0 G0 R0 B1 G1 R1 0
    return result;
}
#endif
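// v_matmulvec() works in fixed point: each coefficient is pre-scaled by
// SCALE = 2^BITS (e.g. 0.299 becomes saturate_cast<short>(0.299f*1024) == 306
// for BITS = 10), the dot products are accumulated as int32, and ">> BITS"
// removes the scale again before the two pixels are packed back into int16.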
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD128
#if CV_SIMD
    const int BITS = 10, SCALE = 1 << BITS;
    const float MAX_M = (float)(1 << (15 - BITS));
    if( hasSIMD128() && scn == 3 && dcn == 3 &&
        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 &&
        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 &&
        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
    if( scn == 3 && dcn == 3 &&
        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M*256 && std::abs(m[3]) < MAX_M*256 &&
        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M*256 && std::abs(m[7]) < MAX_M*256 &&
        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M*256 && std::abs(m[11]) < MAX_M*256 )
{
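        // The guard above keeps the scaled coefficients representable: with
        // BITS = 10 a per-channel weight w is stored as saturate_cast<short>(w*1024),
        // so |w| must stay below MAX_M = 2^(15-BITS) = 32. Terms kept in 32 bit
        // (the offsets, and in the wide path also the third column) only need
        // the 256x looser bound MAX_M*256.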
        const int nChannels = 3;
        const int cWidth = v_int16x8::nlanes;
        // faster fixed-point transformation
        short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE),
              m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE),
              m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE),
              m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE),
              m22 = saturate_cast<short>(m[10]*SCALE);
        int m03 = saturate_cast<int>((m[3] + 0.5f)*SCALE), m13 = saturate_cast<int>((m[7] + 0.5f)*SCALE),
            m23 = saturate_cast<int>((m[11] + 0.5f)*SCALE);
        v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0);
        v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0);
        v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0);
        v_int32x4 m3 = v_int32x4(m03, m13, m23, 0);
        int x = 0;
        for( ; x <= (len - cWidth)*nChannels; x += cWidth*nChannels )
        union {
            short s[6];
            int p[3];
        } m16;
        m16.s[0] = saturate_cast<short>(m[0]*SCALE); m16.s[1] = saturate_cast<short>(m[1]*SCALE);
        m16.s[2] = saturate_cast<short>(m[4]*SCALE); m16.s[3] = saturate_cast<short>(m[5]*SCALE);
        m16.s[4] = saturate_cast<short>(m[8]*SCALE); m16.s[5] = saturate_cast<short>(m[9]*SCALE);
        int m32[] = { saturate_cast<int>(m[2]*SCALE), saturate_cast<int>(m[3]*SCALE),
                      saturate_cast<int>(m[6]*SCALE), saturate_cast<int>(m[7]*SCALE),
                      saturate_cast<int>(m[10]*SCALE), saturate_cast<int>(m[11]*SCALE) };
        v_int16 m01 = v_reinterpret_as_s16(vx_setall_s32(m16.p[0]));
        v_int32 m2 = vx_setall_s32(m32[0]);
        v_int32 m3 = vx_setall_s32(m32[1]);
        v_int16 m45 = v_reinterpret_as_s16(vx_setall_s32(m16.p[1]));
        v_int32 m6 = vx_setall_s32(m32[2]);
        v_int32 m7 = vx_setall_s32(m32[3]);
        v_int16 m89 = v_reinterpret_as_s16(vx_setall_s32(m16.p[2]));
        v_int32 m10 = vx_setall_s32(m32[4]);
        v_int32 m11 = vx_setall_s32(m32[5]);
        int x = 0;
        for( ; x <= (len - v_uint8::nlanes)*nChannels; x += v_uint8::nlanes*nChannels )
{
            // load 8 pixels
            v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x));
            v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth));
            v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth*2));
            v_int16x8 v3;
            // rotate and pack
            v3 = v_rotate_right<1>(v2);    // 0 b6 g6 r6 b7 g7 r7 0
            v2 = v_rotate_left<5>(v2, v1); // 0 b4 g4 r4 b5 g5 r5 0
            v1 = v_rotate_left<3>(v1, v0); // 0 b2 g2 r2 b3 g3 r3 0
            v0 = v_rotate_left<1>(v0);     // 0 b0 g0 r0 b1 g1 r1 0
            // multiply with matrix and normalize
            v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0
            v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0
            v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0
            v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0
            // narrow down as uint8x16
            v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0
            v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0
            v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0
            v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0
            // rotate and pack
            z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >> 8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40);  // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0
            z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0
            z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) << 8);  // G5 R5 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0
            // store on memory
            v_store_low(dst + x, z0);
            v_store_low(dst + x + cWidth, z1);
            v_store_low(dst + x + cWidth*2, z2);
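            // Wide universal-intrinsics variant of the same loop body:
            // deinterleave into B, G, R planes, zip B with G so every 32-bit
            // lane holds a (b,g) pair, and let v_dotprod multiply it by the
            // packed (m[0],m[1]) / (m[4],m[5]) / (m[8],m[9]) coefficient pairs.
            // The R term and the offset are added as plain int32 products and
            // v_rshr_pack<BITS> does the rounding shift back to 16 bit.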
            v_uint8 b, g, r;
            v_load_deinterleave(src + x, b, g, r);
            v_uint8 bgl, bgh;
            v_zip(b, g, bgl, bgh);
            v_uint16 rl, rh;
            v_expand(r, rl, rh);
            v_int16 dbl, dbh, dgl, dgh, drl, drh;
            v_uint16 p0, p2;
            v_int32 p1, p3;
            v_expand(bgl, p0, p2);
            v_expand(v_reinterpret_as_s16(rl), p1, p3);
            dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1*m2 + m3,
                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3*m2 + m3);
            dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1*m6 + m7,
                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3*m6 + m7);
            drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1*m10 + m11,
                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3*m10 + m11);
            v_expand(bgh, p0, p2);
            v_expand(v_reinterpret_as_s16(rh), p1, p3);
            dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1*m2 + m3,
                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3*m2 + m3);
            dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1*m6 + m7,
                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3*m6 + m7);
            drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1*m10 + m11,
                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3*m10 + m11);
            v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
}
        m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
        m32[3] = saturate_cast<int>((m[7] + 0.5f)*SCALE);
        m32[5] = saturate_cast<int>((m[11] + 0.5f)*SCALE);
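        // In the vector loop the rounding is done by v_rshr_pack<BITS>, so the
        // +0.5 rounding bias is folded into the offsets only now, for the
        // scalar tail below that uses a plain ">> BITS".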
        for( ; x < len*nChannels; x += nChannels )
        {
            int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03) >> BITS);
            uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13) >> BITS);
            uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23) >> BITS);
            uchar t0 = saturate_cast<uchar>((m16.s[0]*v0 + m16.s[1]*v1 + m32[0]*v2 + m32[1]) >> BITS);
            uchar t1 = saturate_cast<uchar>((m16.s[2]*v0 + m16.s[3]*v1 + m32[2]*v2 + m32[3]) >> BITS);
            uchar t2 = saturate_cast<uchar>((m16.s[4]*v0 + m16.s[5]*v1 + m32[4]*v2 + m32[5]) >> BITS);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
        vx_cleanup();
        return;
    }
#endif
@@ -1570,64 +1537,65 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
static void
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD128 && !defined(__aarch64__)
    if( hasSIMD128() && scn == 3 && dcn == 3 )
#if CV_SIMD && !defined(__aarch64__)
    if( scn == 3 && dcn == 3 )
{
        const int nChannels = 3;
        const int cWidth = v_float32x4::nlanes;
        v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
        v_float32x4 m0, m1, m2, m3;
        load3x3Matrix(m, m0, m1, m2, m3);
        m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f);
        int x = 0;
        for( ; x <= (len - cWidth)*nChannels; x += cWidth*nChannels )
#if CV_SIMD_WIDTH > 16
        v_float32 m0 = vx_setall_f32(m[0]);
        v_float32 m1 = vx_setall_f32(m[1]);
        v_float32 m2 = vx_setall_f32(m[2]);
        v_float32 m3 = vx_setall_f32(m[3] - 32768.f);
        v_float32 m4 = vx_setall_f32(m[4]);
        v_float32 m5 = vx_setall_f32(m[5]);
        v_float32 m6 = vx_setall_f32(m[6]);
        v_float32 m7 = vx_setall_f32(m[7] - 32768.f);
        v_float32 m8 = vx_setall_f32(m[8]);
        v_float32 m9 = vx_setall_f32(m[9]);
        v_float32 m10 = vx_setall_f32(m[10]);
        v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
        v_int16 delta = vx_setall_s16(-32768);
        for( ; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3 )
{
            // load 4 pixels
            v_uint16x8 v0_16 = v_load(src + x);                  // b0 g0 r0 b1 g1 r1 b2 g2
            v_uint16x8 v2_16 = v_load_low(src + x + cWidth*2);   // r2 b3 g3 r3 ? ? ? ?
            // expand to 4 vectors
            v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32;
            v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32);        // b1 g1 r1
            v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32);        // b3 g3 r3
            v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32); // b2 g2 r2
            v_expand(v0_16, v0_32, dummy_32);                           // b0 g0 r0
            // convert to float32x4
            v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32)); // b0 g0 r0
            v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32)); // b1 g1 r1
            v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32)); // b2 g2 r2
            v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32)); // b3 g3 r3
            // multiply and convert back to int32x4
            v_int32x4 y0, y1, y2, y3;
            y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3)); // B0 G0 R0
            y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3)); // B1 G1 R1
            y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3)); // B2 G2 R2
            y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3)); // B3 G3 R3
            // narrow down to int16x8
            v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta); // 0 B0 G0 R0 B1 G1 R1 0
            v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta); // 0 B2 G2 R2 B3 G3 R3 0
            // rotate and pack
            v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2); // B0 G0 R0 B1 G1 R1 B2 G2
            v2 = v_rotate_right<3>(v2);                        // R2 B3 G3 R3 0 0 0 0
            // store 4 pixels
            v_store(dst + x, v_reinterpret_as_u16(v0));
            v_store_low(dst + x + cWidth*2, v_reinterpret_as_u16(v2));
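            // Wide variant: deinterleave B, G, R, widen to 32 bit, convert to
            // float and apply the matrix with chained v_muladd per output
            // channel. v_pack saturates to signed int16, so 32768 was already
            // subtracted from the offsets (m3, m7, m11); v_add_wrap with
            // delta = -32768 then wraps the biased result back into the
            // unsigned 16-bit range before it is reinterpreted as ushort.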
            v_uint16 b, g, r;
            v_load_deinterleave(src + x, b, g, r);
            v_uint32 bl, bh, gl, gh, rl, rh;
            v_expand(b, bl, bh);
            v_expand(g, gl, gh);
            v_expand(r, rl, rh);
            v_int16 db, dg, dr;
            db = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m2, m3)))),
                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m2, m3))))), delta);
            dg = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m6, m7)))),
                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m6, m7))))), delta);
            dr = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m10, m11)))),
                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m10, m11))))), delta);
            v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
}
        for( ; x < len*nChannels; x += nChannels )
#endif
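        // 128-bit fallback: two pixels per iteration. The _m*h columns are the
        // _m*l columns rotated up by one lane, so the first pixel lands in
        // lanes 1..3 and the second in lanes 0..2 of the other half; after the
        // biased pack and v_add_wrap the result is rotated right by one lane
        // and stored as 8 ushorts. The loop bound keeps that store in range,
        // and its two trailing lanes are rewritten by the next iteration or by
        // the scalar tail.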
        v_float32x4 _m0l(m[0], m[4], m[8], 0.f);
        v_float32x4 _m1l(m[1], m[5], m[9], 0.f);
        v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
        v_float32x4 _m3l(m[3] - 32768.f, m[7] - 32768.f, m[11] - 32768.f, 0.f);
        v_float32x4 _m0h = v_rotate_left<1>(_m0l);
        v_float32x4 _m1h = v_rotate_left<1>(_m1l);
        v_float32x4 _m2h = v_rotate_left<1>(_m2l);
        v_float32x4 _m3h = v_rotate_left<1>(_m3l);
        v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
        for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 )
            v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
                v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x))), _m0h, _m1h, _m2h, _m3h)),
                v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
        for( ; x < len*3; x += 3 )
        {
            float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            ushort t2 = saturate_cast<ushort>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
        vx_cleanup();
        return;
    }
#endif
@@ -1638,52 +1606,68 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
static void
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD128 && !defined(__aarch64__)
    if( hasSIMD128() )
#if CV_SIMD && !defined(__aarch64__)
    int x = 0;
    if( scn == 3 && dcn == 3 )
    {
        int x = 0;
        if( scn == 3 && dcn == 3 )
        int idx[v_float32::nlanes/2];
        for( int i = 0; i < v_float32::nlanes/4; i++ )
        {
            const int cWidth = 3;
            v_float32x4 m0, m1, m2, m3;
            load3x3Matrix(m, m0, m1, m2, m3);
            for( ; x < (len - 1)*cWidth; x += cWidth )
            {
                v_float32x4 x0 = v_load(src + x);
                v_float32x4 y0 = v_matmuladd(x0, m0, m1, m2, m3);
                v_store_low(dst + x, y0);
                dst[x + 2] = v_combine_high(y0, y0).get0();
            }
            for( ; x < len*cWidth; x += cWidth )
            {
                float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
                float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
                float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
                float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
                dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
            }
            return;
            idx[i] = 3*i;
            idx[i + v_float32::nlanes/4] = 0;
        }
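        // idx holds quad offsets 0, 3, 6, ... so vx_lut_quads(src + x, idx)
        // gathers one BGR triplet (plus one padding element) per quad, while
        // the all-zero second half of idx makes vx_lut_quads(_m + k, ...)
        // broadcast the same matrix column to every quad. v_pack_triplets()
        // then compacts the 3 valid lanes of each quad to the front of the
        // register; the few scratch lanes at the end of each store stay in
        // bounds and are overwritten by the next iteration or the scalar tail.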
        float _m[] = { m[0], m[4], m[8], 0.f,
                       m[1], m[5], m[9], 0.f,
                       m[2], m[6], m[10], 0.f,
                       m[3], m[7], m[11], 0.f };
        v_float32 m0 = vx_lut_quads(_m, idx + v_float32::nlanes/4);
        v_float32 m1 = vx_lut_quads(_m + 4, idx + v_float32::nlanes/4);
        v_float32 m2 = vx_lut_quads(_m + 8, idx + v_float32::nlanes/4);
        v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
        for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 )
            v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
        for( ; x < len*3; x += 3 )
        {
            float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
        vx_cleanup();
        return;
    }
    if( scn == 4 && dcn == 4 )
    if( scn == 4 && dcn == 4 )
{
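        // 4-channel case: each output pixel is the 4x4 coefficient block times
        // the source pixel plus the 5th column (m[4], m[9], m[14], m[19]). The
        // multiplicative coefficients of row i are the 4 consecutive floats at
        // m + 5*i, which is why the loads/LUTs below use offsets 0, 5, 10, 15,
        // and v_reduce_sum4 turns the four per-row products into one 4-element
        // result per pixel.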
#if CV_SIMD_WIDTH > 16
        int idx[v_float32::nlanes/4];
        for( int i = 0; i < v_float32::nlanes/4; i++ )
            idx[i] = 0;
        float _m[] = { m[4], m[9], m[14], m[19] };
        v_float32 m0 = vx_lut_quads(m, idx);
        v_float32 m1 = vx_lut_quads(m + 5, idx);
        v_float32 m2 = vx_lut_quads(m + 10, idx);
        v_float32 m3 = vx_lut_quads(m + 15, idx);
        v_float32 m4 = vx_lut_quads(_m, idx);
        for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes )
{
            const int cWidth = 4;
            v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]);
            v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]);
            v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]);
            v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]);
            v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]);
            for( ; x < len*cWidth; x += cWidth )
            {
                v_float32x4 x0 = v_load(src + x);
                v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4;
                v_store(dst + x, y0);
            }
            return;
            v_float32 v_src = vx_load(src + x);
            v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
        }
#endif
        v_float32x4 _m0 = v_load(m);
        v_float32x4 _m1 = v_load(m + 5);
        v_float32x4 _m2 = v_load(m + 10);
        v_float32x4 _m3 = v_load(m + 15);
        v_float32x4 _m4(m[4], m[9], m[14], m[19]);
        for( ; x < len*4; x += v_float32x4::nlanes )
{
            v_float32x4 v_src = v_load(src + x);
            v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
        }
        vx_cleanup();
        return;
    }
#endif