@@ -1699,41 +1699,53 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
}
}
#if CV_SSE2
#if CV_SIMD128
static inline void
load3x3Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3 )
load3x3Matrix( const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3 )
{
    m0 = _mm_setr_ps(m[0], m[4], m[8], 0);
    m1 = _mm_setr_ps(m[1], m[5], m[9], 0);
    m2 = _mm_setr_ps(m[2], m[6], m[10], 0);
    m3 = _mm_setr_ps(m[3], m[7], m[11], 0);
    m0 = v_float32x4(m[0], m[4], m[8], 0);
    m1 = v_float32x4(m[1], m[5], m[9], 0);
    m2 = v_float32x4(m[2], m[6], m[10], 0);
    m3 = v_float32x4(m[3], m[7], m[11], 0);
}
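// Note: m points at the 3x4 row-major transform matrix; load3x3Matrix stores its
// columns, so the v_matmuladd(x, m0, m1, m2, m3) calls below compute
// x[0]*m0 + x[1]*m1 + x[2]*m2 + m3, i.e. M * (b, g, r, 1)^T for one pixel.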
static inline void
load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, __m128& m4 )
static inline v_int16x8
v_matmulvec( const v_int16x8& v0, const v_int16x8& m0, const v_int16x8& m1, const v_int16x8& m2, const v_int32x4& m3, const int BITS )
{
    m0 = _mm_setr_ps(m[0], m[5], m[10], m[15]);
    m1 = _mm_setr_ps(m[1], m[6], m[11], m[16]);
    m2 = _mm_setr_ps(m[2], m[7], m[12], m[17]);
    m3 = _mm_setr_ps(m[3], m[8], m[13], m[18]);
    m4 = _mm_setr_ps(m[4], m[9], m[14], m[19]);
}
    // v0 : 0 b0 g0 r0 b1 g1 r1 ?
    v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1
    v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1
    v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1
    v_int32x4 t3 = v_setzero_s32();
    v_int32x4 s0, s1, s2, s3;
    v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3);
    s0 = s0 + s1 + m3; // B0 G0 R0 ?
    s2 = s2 + s3 + m3; // B1 G1 R1 ?
    s0 = s0 >> BITS;
    s2 = s2 >> BITS;
    v_int16x8 result = v_pack(s0, v_setzero_s32()); // B0 G0 R0 0 0 0 0 0
    result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16); // 0 B0 G0 R0 0 0 0 0
    result = result | v_pack(v_setzero_s32(), s2); // 0 B0 G0 R0 B1 G1 R1 0
    return result;
}
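// Note: v_matmulvec applies the fixed-point 3x3 matrix to two interleaved pixels at
// once: v_dotprod forms the pairwise 16x16->32 bit products, v_transpose4x4 regroups
// the partial sums per output channel, the bias m3 is added, and >> BITS rescales
// before the two results are packed back into the shifted "0 B G R B G R 0" layout.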
#endif
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SSE2
#if CV_SIMD128
    const int BITS = 10, SCALE = 1 << BITS;
    const float MAX_M = (float)(1 << (15 - BITS));
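    // Rough bound: with BITS = 10, MAX_M = 32, so coefficients stay below 2^15 after
    // scaling by 2^10 and still fit in int16, while the offsets (bounded by MAX_M*256)
    // stay well inside int32; the fixed-point sums below therefore cannot overflow.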
    if( USE_SSE2 && scn == 3 && dcn == 3 &&
    if( hasSIMD128() && scn == 3 && dcn == 3 &&
        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 &&
        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 &&
        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
    {
        const int nChannels = 3;
        const int cWidth = v_int16x8::nlanes;
        // faster fixed-point transformation
        short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE),
            m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE),
@@ -1743,92 +1755,50 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
        int m03 = saturate_cast<int>((m[3] + 0.5f)*SCALE), m13 = saturate_cast<int>((m[7] + 0.5f)*SCALE),
            m23 = saturate_cast<int>((m[11] + 0.5f)*SCALE);
        __m128i m0 = _mm_setr_epi16(0, m00, m01, m02, m00, m01, m02, 0);
        __m128i m1 = _mm_setr_epi16(0, m10, m11, m12, m10, m11, m12, 0);
        __m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0);
        __m128i m3 = _mm_setr_epi32(m03, m13, m23, 0);
        v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0);
        v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0);
        v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0);
        v_int32x4 m3 = v_int32x4(m03, m13, m23, 0);
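        // The coefficient layout "0 m00 m01 m02 m00 m01 m02 0" mirrors the pixel
        // layout "0 b g r b g r ?" used below, so each v_dotprod lane adds up terms
        // of one output channel for one of the two pixels. The +0.5f folded into
        // m03/m13/m23 makes the final >> BITS round to nearest instead of truncating.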
        int x = 0;
        for( ; x <= (len - 8)*3; x += 8*3 )
        {
            __m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1;
            __m128i v0 = _mm_loadl_epi64((const __m128i*)(src + x));
            __m128i v1 = _mm_loadl_epi64((const __m128i*)(src + x + 8));
            __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 16)), v3;
            v0 = _mm_unpacklo_epi8(v0, z); // b0 g0 r0 b1 g1 r1 b2 g2
            v1 = _mm_unpacklo_epi8(v1, z); // r2 b3 g3 r3 b4 g4 r4 b5
            v2 = _mm_unpacklo_epi8(v2, z); // g5 r5 b6 g6 r6 b7 g7 r7
            v3 = _mm_srli_si128(v2, 2); // ? b6 g6 r6 b7 g7 r7 0
            v2 = _mm_or_si128(_mm_slli_si128(v2, 10), _mm_srli_si128(v1, 6)); // ? b4 g4 r4 b5 g5 r5 ?
            v1 = _mm_or_si128(_mm_slli_si128(v1, 6), _mm_srli_si128(v0, 10)); // ? b2 g2 r2 b3 g3 r3 ?
            v0 = _mm_slli_si128(v0, 2); // 0 b0 g0 r0 b1 g1 r1 ?
            // process pixels 0 & 1
            t0 = _mm_madd_epi16(v0, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v0, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v0, m2); // e0 f0 e1 f1
            v0 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v0, t1), _mm_unpackhi_epi64(v0, t1)), m3); // B0 G0 R0 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0, t2)), m3); // B1 G1 R1 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v0 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B0 G0 R0 B1 G1 R1 0
            // process pixels 2 & 3
            t0 = _mm_madd_epi16(v1, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v1, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v1, m2); // e0 f0 e1 f1
            v1 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v1, t1), _mm_unpackhi_epi64(v1, t1)), m3); // B2 G2 R2 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0, t2)), m3); // B3 G3 R3 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v1 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B2 G2 R2 B3 G3 R3 0
            // process pixels 4 & 5
            t0 = _mm_madd_epi16(v2, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v2, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v2, m2); // e0 f0 e1 f1
            v2 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v2, t1), _mm_unpackhi_epi64(v2, t1)), m3); // B4 G4 R4 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0, t2)), m3); // B5 G5 R5 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v2 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B4 G4 R4 B5 G5 R5 0
            // process pixels 6 & 7
            t0 = _mm_madd_epi16(v3, m0); // a0 b0 a1 b1
            t1 = _mm_madd_epi16(v3, m1); // c0 d0 c1 d1
            t2 = _mm_madd_epi16(v3, m2); // e0 f0 e1 f1
            v3 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0
            t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1
            t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0
            t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0
            r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v3, t1), _mm_unpackhi_epi64(v3, t1)), m3); // B6 G6 R6 0
            r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0, t2)), m3); // B7 G7 R7 0
            r0 = _mm_srai_epi32(r0, BITS);
            r1 = _mm_srai_epi32(r1, BITS);
            v3 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B6 G6 R6 B7 G7 R7 0
            v0 = _mm_or_si128(_mm_srli_si128(v0, 1), _mm_slli_si128(v1, 5));
            v1 = _mm_or_si128(_mm_srli_si128(v1, 3), _mm_slli_si128(v2, 3));
            v2 = _mm_or_si128(_mm_srli_si128(v2, 5), _mm_slli_si128(v3, 1));
            _mm_storel_epi64((__m128i*)(dst + x), v0);
            _mm_storel_epi64((__m128i*)(dst + x + 8), v1);
            _mm_storel_epi64((__m128i*)(dst + x + 16), v2);
        }
        for( ; x < len*3; x += 3 )
        for( ; x <= (len - cWidth)*nChannels; x += cWidth*nChannels )
        {
            // load 8 pixels
            v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x));
            v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth));
            v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth*2));
            v_int16x8 v3;
            // rotate and pack
            v3 = v_rotate_right<1>(v2); // 0 b6 g6 r6 b7 g7 r7 0
            v2 = v_rotate_left<5>(v2, v1); // 0 b4 g4 r4 b5 g5 r5 0
            v1 = v_rotate_left<3>(v1, v0); // 0 b2 g2 r2 b3 g3 r3 0
            v0 = v_rotate_left<1>(v0); // 0 b0 g0 r0 b1 g1 r1 0
            // multiply with matrix and normalize
            v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0
            v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0
            v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0
            v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0
            // narrow down as uint8x16
            v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0
            v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0
            v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0
            v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0
            // rotate and pack
            z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >> 8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40); // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0
            z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0
            z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) << 8); // G5 R5 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0
            // store to memory
            v_store_low(dst + x, z0);
            v_store_low(dst + x + cWidth, z1);
            v_store_low(dst + x + cWidth*2, z2);
        }
        for( ; x < len*nChannels; x += nChannels )
        {
            int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03) >> BITS);
@@ -1846,61 +1816,63 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
static void
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SSE2
    if( USE_SSE2 && scn == 3 && dcn == 3 )
#if CV_SIMD128 && !defined(__aarch64__)
    if( hasSIMD128() && scn == 3 && dcn == 3 )
    {
        __m128 m0, m1, m2, m3;
        __m128i delta = _mm_setr_epi16(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
        const int nChannels = 3;
        const int cWidth = v_float32x4::nlanes;
        v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
        v_float32x4 m0, m1, m2, m3;
        load3x3Matrix(m, m0, m1, m2, m3);
        m3 = _mm_sub_ps(m3, _mm_setr_ps(32768.f, 32768.f, 32768.f, 0.f));
        m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f);
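        // The results are biased by -32768 (folded into m3) so the saturating signed
        // v_pack below clamps correctly, then v_add_wrap with delta adds 32768 back
        // modulo 2^16, recovering the unsigned 16-bit values; lanes 0 and 7 are the
        // padding slots and stay unbiased.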
        int x = 0;
        for( ; x <= (len - 4)*3; x += 4*3 )
        {
            __m128i z = _mm_setzero_si128();
            __m128i v0 = _mm_loadu_si128((const __m128i*)(src + x)), v1;
            __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 8)), v3;
            v1 = _mm_unpacklo_epi16(_mm_srli_si128(v0, 6), z); // b1 g1 r1
            v3 = _mm_unpacklo_epi16(_mm_srli_si128(v2, 2), z); // b3 g3 r3
            v2 = _mm_or_si128(_mm_srli_si128(v0, 12), _mm_slli_si128(v2, 4));
            v0 = _mm_unpacklo_epi16(v0, z); // b0 g0 r0
            v2 = _mm_unpacklo_epi16(v2, z); // b2 g2 r2
            __m128 x0 = _mm_cvtepi32_ps(v0), x1 = _mm_cvtepi32_ps(v1);
            __m128 x2 = _mm_cvtepi32_ps(v2), x3 = _mm_cvtepi32_ps(v3);
            __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                _mm_mul_ps(m0, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 0, 0, 0))),
                _mm_mul_ps(m1, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(1, 1, 1, 1)))),
                _mm_mul_ps(m2, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(2, 2, 2, 2)))), m3);
            __m128 y1 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                _mm_mul_ps(m0, _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(0, 0, 0, 0))),
                _mm_mul_ps(m1, _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(1, 1, 1, 1)))),
                _mm_mul_ps(m2, _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(2, 2, 2, 2)))), m3);
            __m128 y2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                _mm_mul_ps(m0, _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(0, 0, 0, 0))),
                _mm_mul_ps(m1, _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(1, 1, 1, 1)))),
                _mm_mul_ps(m2, _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 2, 2, 2)))), m3);
            __m128 y3 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                _mm_mul_ps(m0, _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(0, 0, 0, 0))),
                _mm_mul_ps(m1, _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(1, 1, 1, 1)))),
                _mm_mul_ps(m2, _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(2, 2, 2, 2)))), m3);
            v0 = _mm_cvtps_epi32(y0); v1 = _mm_cvtps_epi32(y1);
            v2 = _mm_cvtps_epi32(y2); v3 = _mm_cvtps_epi32(y3);
            v0 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v0, 4), v1), delta); // 0 b0 g0 r0 b1 g1 r1 0
            v2 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v2, 4), v3), delta); // 0 b2 g2 r2 b3 g3 r3 0
            v1 = _mm_or_si128(_mm_srli_si128(v0, 2), _mm_slli_si128(v2, 10)); // b0 g0 r0 b1 g1 r1 b2 g2
            v2 = _mm_srli_si128(v2, 6); // r2 b3 g3 r3 0 0 0 0
            _mm_storeu_si128((__m128i*)(dst + x), v1);
            _mm_storel_epi64((__m128i*)(dst + x + 8), v2);
        }
        for( ; x < len*3; x += 3 )
        {
            float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            ushort t2 = saturate_cast<ushort>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        for( ; x <= (len - cWidth)*nChannels; x += cWidth*nChannels )
        {
            // load 4 pixels
            v_uint16x8 v0_16 = v_load(src + x); // b0 g0 r0 b1 g1 r1 b2 g2
            v_uint16x8 v2_16 = v_load_low(src + x + cWidth*2); // r2 b3 g3 r3 ? ? ? ?
            // expand to 4 vectors
            v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32;
            v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32); // b1 g1 r1
            v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32); // b3 g3 r3
            v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32); // b2 g2 r2
            v_expand(v0_16, v0_32, dummy_32); // b0 g0 r0
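            // Four BGR ushort pixels are 12 values: v_load grabs the first eight and
            // v_load_low the remaining four; the rotates realign each pixel's b to
            // lane 0 (the two-register rotate pulls pixel 2's r from the second
            // vector) and v_expand widens to 32 bits for the float conversion.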
            // convert to float32x4
            v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32)); // b0 g0 r0
            v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32)); // b1 g1 r1
            v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32)); // b2 g2 r2
            v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32)); // b3 g3 r3
            // multiply and convert back to int32x4
            v_int32x4 y0, y1, y2, y3;
            y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3)); // B0 G0 R0
            y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3)); // B1 G1 R1
            y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3)); // B2 G2 R2
            y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3)); // B3 G3 R3
            // narrow down to int16x8
            v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta); // 0 B0 G0 R0 B1 G1 R1 0
            v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta); // 0 B2 G2 R2 B3 G3 R3 0
            // rotate and pack
            v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2); // B0 G0 R0 B1 G1 R1 B2 G2
            v2 = v_rotate_right<3>(v2); // R2 B3 G3 R3 0 0 0 0
            // store 4 pixels
            v_store(dst + x, v_reinterpret_as_u16(v0));
            v_store_low(dst + x + cWidth*2, v_reinterpret_as_u16(v2));
        }
        for( ; x < len*nChannels; x += nChannels )
        {
            float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            ushort t2 = saturate_cast<ushort>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
        return;
    }
@@ -1909,31 +1881,28 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
    transform_(src, dst, m, len, scn, dcn);
}
static void
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SSE2
    if( USE_SSE2 )
#if CV_SIMD128 && !defined(__aarch64__)
    if( hasSIMD128() )
    {
        int x = 0;
        if( scn == 3 && dcn == 3 )
        {
            __m128 m0, m1, m2, m3;
            const int cWidth = 3;
            v_float32x4 m0, m1, m2, m3;
            load3x3Matrix(m, m0, m1, m2, m3);
            for( ; x < (len - 1)*3; x += 3 )
            for( ; x < (len - 1)*cWidth; x += cWidth )
            {
                __m128 x0 = _mm_loadu_ps(src + x);
                __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(
                    _mm_mul_ps(m0, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 0, 0, 0))),
                    _mm_mul_ps(m1, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(1, 1, 1, 1)))),
                    _mm_mul_ps(m2, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(2, 2, 2, 2)))), m3);
                _mm_storel_pi((__m64*)(dst + x), y0);
                _mm_store_ss(dst + x + 2, _mm_movehl_ps(y0, y0));
                v_float32x4 x0 = v_load(src + x);
                v_float32x4 y0 = v_matmuladd(x0, m0, m1, m2, m3);
                v_store_low(dst + x, y0);
                dst[x + 2] = v_combine_high(y0, y0).get0();
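                // y0 holds (B, G, R, unused): the first two lanes go out through
                // v_store_low and lane 2 is extracted via v_combine_high(y0, y0),
                // which moves the high half down so R sits in lane 0 for get0().
                // The loop stops at (len - 1) pixels because v_load reads one float
                // past the last 3-channel pixel; the scalar tail finishes the rest.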
            }
            for( ; x < len*3; x += 3 )
            for( ; x < len*cWidth; x += cWidth )
            {
                float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
                float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
@@ -1946,18 +1915,18 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
        if( scn == 4 && dcn == 4 )
        {
            __m128 m0, m1, m2, m3, m4;
            load4x4Matrix(m, m0, m1, m2, m3, m4);
            const int cWidth = 4;
            v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]);
            v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]);
            v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]);
            v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]);
            v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]);
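            // m is the 4x5 row-major matrix: m0..m3 hold its first four columns and
            // m4 the offset column, so v_matmul(x0, m0, m1, m2, m3) + m4 below is the
            // full affine transform M*x + b for one 4-channel pixel.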
            for( ; x < len*4; x += 4 )
            for( ; x < len*cWidth; x += cWidth )
            {
                __m128 x0 = _mm_loadu_ps(src + x);
                __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(
                    _mm_mul_ps(m0, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 0, 0, 0))),
                    _mm_mul_ps(m1, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(1, 1, 1, 1)))),
                    _mm_mul_ps(m2, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(2, 2, 2, 2)))),
                    _mm_mul_ps(m3, _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(3, 3, 3, 3)))), m4);
                _mm_storeu_ps(dst + x, y0);
                v_float32x4 x0 = v_load(src + x);
                v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4;
                v_store(dst + x, y0);
            }
            return;
        }
@@ -2342,58 +2311,21 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
{
    float alpha = *_alpha;
    int i = 0;
#if CV_SSE2
    if( USE_SSE2 )
#if CV_SIMD128
    if( hasSIMD128() )
    {
        __m128 a4 = _mm_set1_ps(alpha);
        if( (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 )
            for( ; i <= len - 8; i += 8 )
            {
                __m128 x0, x1, y0, y1, t0, t1;
                x0 = _mm_load_ps(src1 + i); x1 = _mm_load_ps(src1 + i + 4);
                y0 = _mm_load_ps(src2 + i); y1 = _mm_load_ps(src2 + i + 4);
                t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0);
                t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1);
                _mm_store_ps(dst + i, t0);
                _mm_store_ps(dst + i + 4, t1);
            }
        else
            for( ; i <= len - 8; i += 8 )
            {
                __m128 x0, x1, y0, y1, t0, t1;
                x0 = _mm_loadu_ps(src1 + i); x1 = _mm_loadu_ps(src1 + i + 4);
                y0 = _mm_loadu_ps(src2 + i); y1 = _mm_loadu_ps(src2 + i + 4);
                t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0);
                t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1);
                _mm_storeu_ps(dst + i, t0);
                _mm_storeu_ps(dst + i + 4, t1);
            }
    }
    else
#elif CV_NEON
    if( true )
    {
        for( ; i <= len - 4; i += 4 )
        v_float32x4 v_alpha = v_setall_f32(alpha);
        const int cWidth = v_float32x4::nlanes;
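        // With the universal intrinsics there is no separate aligned path: v_load and
        // v_store handle unaligned pointers, so the two SSE branches above collapse
        // into this single loop and the leftover elements fall through to the scalar
        // tail below.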
        for( ; i <= len - cWidth; i += cWidth )
        {
            float32x4_t v_src1 = vld1q_f32(src1 + i), v_src2 = vld1q_f32(src2 + i);
            vst1q_f32(dst + i, vaddq_f32(vmulq_n_f32(v_src1, alpha), v_src2));
            v_float32x4 v_src1 = v_load(src1 + i);
            v_float32x4 v_src2 = v_load(src2 + i);
            v_store(dst + i, (v_src1 * v_alpha) + v_src2);
        }
    }
    else
#endif
    //vz why do we need unroll here?
    for( ; i <= len - 4; i += 4 )
    {
        float t0, t1;
        t0 = src1[i]*alpha + src2[i];
        t1 = src1[i+1]*alpha + src2[i+1];
        dst[i] = t0; dst[i+1] = t1;
        t0 = src1[i+2]*alpha + src2[i+2];
        t1 = src1[i+3]*alpha + src2[i+3];
        dst[i+2] = t0; dst[i+3] = t1;
    }
    for( ; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
    for( ; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
}
@@ -2402,36 +2334,25 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
{
    double alpha = *_alpha;
    int i = 0;
#if CV_SSE2
    if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 )
#if CV_SIMD128_64F
    if( hasSIMD128() )
    {
        __m128d a2 = _mm_set1_pd(alpha);
        for( ; i <= len - 4; i += 4 )
        v_float64x2 a2 = v_setall_f64(alpha);
        const int cWidth = v_float64x2::nlanes;
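        // Guarded by CV_SIMD128_64F because not every SIMD128 backend provides double
        // lanes; the loop below handles two v_float64x2 vectors (2*cWidth doubles) per
        // iteration, mirroring the unrolling of the old SSE2 code.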
        for( ; i <= len - cWidth*2; i += cWidth*2 )
        {
            __m128d x0, x1, y0, y1, t0, t1;
            x0 = _mm_load_pd(src1 + i); x1 = _mm_load_pd(src1 + i + 2);
            y0 = _mm_load_pd(src2 + i); y1 = _mm_load_pd(src2 + i + 2);
            t0 = _mm_add_pd(_mm_mul_pd(x0, a2), y0);
            t1 = _mm_add_pd(_mm_mul_pd(x1, a2), y1);
            _mm_store_pd(dst + i, t0);
            _mm_store_pd(dst + i + 2, t1);
            v_float64x2 x0, x1, y0, y1, t0, t1;
            x0 = v_load(src1 + i); x1 = v_load(src1 + i + cWidth);
            y0 = v_load(src2 + i); y1 = v_load(src2 + i + cWidth);
            t0 = x0 * a2 + y0;
            t1 = x1 * a2 + y1;
            v_store(dst + i, t0);
            v_store(dst + i + cWidth, t1);
        }
    }
    else
#endif
    //vz why do we need unroll here?
    for( ; i <= len - 4; i += 4 )
    {
        double t0, t1;
        t0 = src1[i]*alpha + src2[i];
        t1 = src1[i+1]*alpha + src2[i+1];
        dst[i] = t0; dst[i+1] = t1;
        t0 = src1[i+2]*alpha + src2[i+2];
        t1 = src1[i+3]*alpha + src2[i+3];
        dst[i+2] = t0; dst[i+3] = t1;
    }
    for( ; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
    for( ; i < len; i++ )
        dst[i] = src1[i]*alpha + src2[i];
}
typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);
@@ -3105,43 +3026,36 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
#endif
    int i = 0;
#if CV_SSE2
    if( USE_SSE2 )
#if CV_SIMD128
    if( hasSIMD128() )
    {
        int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize;
        __m128i z = _mm_setzero_si128();
        CV_DECL_ALIGNED(16) int buf[4];
        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
        while( i < len0 )
        while( i < len0 )
        {
            blockSize = std::min(len0 - i, blockSize0);
            __m128i s = z;
            j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            v_int32x4 v_sum = v_setzero_s32();
            const int cWidth = v_uint16x8::nlanes;
            int j = 0;
            for( ; j <= blockSize - cWidth*2; j += cWidth*2 )
            {
                __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j));
                __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j));
                __m128i s0, s1, s2, s3;
                s0 = _mm_unpacklo_epi8(b0, z);
                s2 = _mm_unpackhi_epi8(b0, z);
                s1 = _mm_unpacklo_epi8(b1, z);
                s3 = _mm_unpackhi_epi8(b1, z);
                s0 = _mm_madd_epi16(s0, s1);
                s2 = _mm_madd_epi16(s2, s3);
                s = _mm_add_epi32(s, s0);
                s = _mm_add_epi32(s, s2);
                v_uint16x8 v_src10, v_src20, v_src11, v_src21;
                v_expand(v_load(src1 + j), v_src10, v_src11);
                v_expand(v_load(src2 + j), v_src20, v_src21);
                v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
                v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
            }
            for( ; j < blockSize; j += 4 )
            for( ; j <= blockSize - cWidth; j += cWidth )
            {
                __m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z);
                __m128i s1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src2 + j)), z);
                s0 = _mm_madd_epi16(s0, s1);
                s = _mm_add_epi32(s, s0);
            }
                v_int16x8 v_src10 = v_reinterpret_as_s16(v_load_expand(src1 + j));
                v_int16x8 v_src20 = v_reinterpret_as_s16(v_load_expand(src2 + j));
            _mm_store_si128((__m128i*)buf, s);
            r += buf[0] + buf[1] + buf[2] + buf[3];
                v_sum += v_dotprod(v_src10, v_src20);
            }
            r += (double)v_reduce_sum(v_sum);
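            // The int32 accumulator is flushed into the double r once per block.
            // With blockSize0 = 1 << 15 each lane of v_sum collects at most
            // 2048 * 4 * 255 * 255 = 532,684,800 < INT_MAX, so the per-block sums
            // cannot overflow (rough bound, assuming 16 bytes per inner iteration).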
            src1 += blockSize;
            src2 += blockSize;
@@ -3149,43 +3063,46 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
        }
    }
#elif CV_NEON
    int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
    uint32x4_t v_zero = vdupq_n_u32(0u);
    CV_DECL_ALIGNED(16) uint buf[4];
    while( i < len0 )
    if( cv::checkHardwareSupport(CV_CPU_NEON) )
    {
        blockSize = std::min(len0 - i, blockSize0);
        uint32x4_t v_sum = v_zero;
        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
        uint32x4_t v_zero = vdupq_n_u32(0u);
        CV_DECL_ALIGNED(16) uint buf[4];
        int j = 0;
        for( ; j <= blockSize - 16; j += 16 )
        while( i < len0 )
        {
            uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
            blockSize = std::min(len0 - i, blockSize0);
            uint32x4_t v_sum = v_zero;
            uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
            v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
            v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
            int j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            {
                uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
                v_src10 = vmovl_u8(vget_high_u8(v_src1));
                v_src20 = vmovl_u8(vget_high_u8(v_src2));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
            }
                uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
            for( ; j <= blockSize - 8; j += 8 )
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
            }
                v_src10 = vmovl_u8(vget_high_u8(v_src1));
                v_src20 = vmovl_u8(vget_high_u8(v_src2));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
            }
            vst1q_u32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];
            for( ; j <= blockSize - 8; j += 8 )
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
            }
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
            vst1q_u32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
    }
#endif
    return r + dotProd_(src1, src2, len - i);
@@ -3194,48 +3111,39 @@ static double dotProd_8s(const schar* src1, const schar* src2, int len)
static double dotProd_8s( const schar* src1, const schar* src2, int len )
{
    int i = 0;
    double r = 0.0;
    int i = 0;
#if CV_SSE2
    if( USE_SSE2 )
#if CV_SIMD128
    if( hasSIMD128() )
    {
        int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize;
        __m128i z = _mm_setzero_si128();
        CV_DECL_ALIGNED(16) int buf[4];
        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
        while( i < len0 )
        while( i < len0 )
        {
            blockSize = std::min(len0 - i, blockSize0);
            __m128i s = z;
            j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            v_int32x4 v_sum = v_setzero_s32();
            const int cWidth = v_int16x8::nlanes;
            int j = 0;
            for( ; j <= blockSize - cWidth*2; j += cWidth*2 )
            {
                __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j));
                __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j));
                __m128i s0, s1, s2, s3;
                s0 = _mm_srai_epi16(_mm_unpacklo_epi8(b0, b0), 8);
                s2 = _mm_srai_epi16(_mm_unpackhi_epi8(b0, b0), 8);
                s1 = _mm_srai_epi16(_mm_unpacklo_epi8(b1, b1), 8);
                s3 = _mm_srai_epi16(_mm_unpackhi_epi8(b1, b1), 8);
                s0 = _mm_madd_epi16(s0, s1);
                s2 = _mm_madd_epi16(s2, s3);
                s = _mm_add_epi32(s, s0);
                s = _mm_add_epi32(s, s2);
                v_int16x8 v_src10, v_src20, v_src11, v_src21;
                v_expand(v_load(src1 + j), v_src10, v_src11);
                v_expand(v_load(src2 + j), v_src20, v_src21);
                v_sum += v_dotprod(v_src10, v_src20);
                v_sum += v_dotprod(v_src11, v_src21);
            }
            for( ; j < blockSize; j += 4 )
            for( ; j <= blockSize - cWidth; j += cWidth )
            {
                __m128i s0 = _mm_cvtsi32_si128(*(const int*)(src1 + j));
                __m128i s1 = _mm_cvtsi32_si128(*(const int*)(src2 + j));
                s0 = _mm_srai_epi16(_mm_unpacklo_epi8(s0, s0), 8);
                s1 = _mm_srai_epi16(_mm_unpacklo_epi8(s1, s1), 8);
                s0 = _mm_madd_epi16(s0, s1);
                s = _mm_add_epi32(s, s0);
            }
                v_int16x8 v_src10 = v_load_expand(src1 + j);
                v_int16x8 v_src20 = v_load_expand(src2 + j);
            _mm_store_si128((__m128i*)buf, s);
            r += buf[0] + buf[1] + buf[2] + buf[3];
                v_sum += v_dotprod(v_src10, v_src20);
            }
            r += (double)v_reduce_sum(v_sum);
            src1 += blockSize;
            src2 += blockSize;
@@ -3243,43 +3151,46 @@ static double dotProd_8s(const schar* src1, const schar* src2, int len)
        }
    }
#elif CV_NEON
    int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
    int32x4_t v_zero = vdupq_n_s32(0);
    CV_DECL_ALIGNED(16) int buf[4];
    while( i < len0 )
    if( cv::checkHardwareSupport(CV_CPU_NEON) )
    {
        blockSize = std::min(len0 - i, blockSize0);
        int32x4_t v_sum = v_zero;
        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
        int32x4_t v_zero = vdupq_n_s32(0);
        CV_DECL_ALIGNED(16) int buf[4];
        int j = 0;
        for( ; j <= blockSize - 16; j += 16 )
        while( i < len0 )
        {
            int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
            blockSize = std::min(len0 - i, blockSize0);
            int32x4_t v_sum = v_zero;
            int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
            v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
            v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
            int j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            {
                int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
                v_src10 = vmovl_s8(vget_high_s8(v_src1));
                v_src20 = vmovl_s8(vget_high_s8(v_src2));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
            }
                int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
            for( ; j <= blockSize - 8; j += 8 )
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
            }
                v_src10 = vmovl_s8(vget_high_s8(v_src1));
                v_src20 = vmovl_s8(vget_high_s8(v_src2));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
            }
            for( ; j <= blockSize - 8; j += 8 )
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
            }
            vst1q_s32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];
            vst1q_s32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
    }
#endif
@@ -3322,26 +3233,27 @@ static double dotProd_32f(const float* src1, const float* src2, int len)
#endif
    int i = 0;
#if CV_NEON
    int len0 = len & -4, blockSize0 = (1 << 13), blockSize;
    float32x4_t v_zero = vdupq_n_f32(0.0f);
    CV_DECL_ALIGNED(16) float buf[4];
    while( i < len0 )
#if CV_SIMD128
    if( hasSIMD128() )
    {
        blockSize = std::min(len0 - i, blockSize0);
        float32x4_t v_sum = v_zero;
        int len0 = len & -4, blockSize0 = (1 << 13), blockSize;
        int j = 0;
        for( ; j <= blockSize - 4; j += 4 )
            v_sum = vmlaq_f32(v_sum, vld1q_f32(src1 + j), vld1q_f32(src2 + j));
        while( i < len0 )
        {
            blockSize = std::min(len0 - i, blockSize0);
            v_float32x4 v_sum = v_setzero_f32();
            int j = 0;
            int cWidth = v_float32x4::nlanes;
            for( ; j <= blockSize - cWidth; j += cWidth )
                v_sum = v_muladd(v_load(src1 + j), v_load(src2 + j), v_sum);
        vst1q_f32(buf, v_sum);
        r += buf[0] + buf[1] + buf[2] + buf[3];
            r += v_reduce_sum(v_sum);
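            // Each block of at most 1 << 13 floats is accumulated in a v_float32x4 and
            // only then folded into the double r, presumably to keep single-precision
            // rounding error from growing with the full length of the arrays.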
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
    }
#endif
    return r + dotProd_(src1, src2, len - i);