|
|
|
@@ -1451,115 +1451,82 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#if CV_SIMD128 && !defined(__aarch64__) |
|
|
|
|
static inline void |
|
|
|
|
load3x3Matrix(const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3) |
|
|
|
|
{ |
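// note: the 3x4 matrix is loaded column-wise (one column per register, fourth
// lane zero) so v_matmuladd() can apply it directly to a (b, g, r) pixel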
|
|
|
|
m0 = v_float32x4(m[0], m[4], m[8], 0); |
|
|
|
|
m1 = v_float32x4(m[1], m[5], m[9], 0); |
|
|
|
|
m2 = v_float32x4(m[2], m[6], m[10], 0); |
|
|
|
|
m3 = v_float32x4(m[3], m[7], m[11], 0); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#if CV_SIMD128 |
|
|
|
|
static inline v_int16x8 |
|
|
|
|
v_matmulvec(const v_int16x8 &v0, const v_int16x8 &m0, const v_int16x8 &m1, const v_int16x8 &m2, const v_int32x4 &m3, const int BITS) |
|
|
|
|
{ |
|
|
|
|
// v0 : 0 b0 g0 r0 b1 g1 r1 ?
|
|
|
|
|
v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1
|
|
|
|
|
v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1
|
|
|
|
|
v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1
|
|
|
|
|
v_int32x4 t3 = v_setzero_s32(); |
|
|
|
|
v_int32x4 s0, s1, s2, s3; |
|
|
|
|
v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3); |
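// after the transpose, s0 + s1 holds the three row dot products of pixel 0 and
// s2 + s3 those of pixel 1 (the fourth lane stays zero)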
|
|
|
|
s0 = s0 + s1 + m3; // B0 G0 R0 ?
|
|
|
|
|
s2 = s2 + s3 + m3; // B1 G1 R1 ?
|
|
|
|
|
|
|
|
|
|
s0 = s0 >> BITS; |
|
|
|
|
s2 = s2 >> BITS; |
|
|
|
|
|
|
|
|
|
v_int16x8 result = v_pack(s0, v_setzero_s32()); // B0 G0 R0 0 0 0 0 0
|
|
|
|
|
result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16); // 0 B0 G0 R0 0 0 0 0
|
|
|
|
|
result = result | v_pack(v_setzero_s32(), s2); // 0 B0 G0 R0 B1 G1 R1 0
|
|
|
|
|
return result; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
static void |
|
|
|
|
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn ) |
|
|
|
|
{ |
|
|
|
|
#if CV_SIMD128 |
|
|
|
|
#if CV_SIMD |
|
|
|
|
const int BITS = 10, SCALE = 1 << BITS; |
|
|
|
|
const float MAX_M = (float)(1 << (15 - BITS)); |
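// coefficients are quantized to Q10 fixed point: they are pre-multiplied by
// SCALE and the accumulated products are shifted back by BITS; the MAX_M
// bounds below keep the scaled 16-bit coefficients from overflowing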
|
|
|
|
|
|
|
|
|
if( hasSIMD128() && scn == 3 && dcn == 3 && |
|
|
|
|
std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 && |
|
|
|
|
std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 && |
|
|
|
|
std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 ) |
|
|
|
|
if( scn == 3 && dcn == 3 && |
|
|
|
|
std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[ 2]) < MAX_M*256 && std::abs(m[ 3]) < MAX_M*256 && |
|
|
|
|
std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[ 6]) < MAX_M*256 && std::abs(m[ 7]) < MAX_M*256 && |
|
|
|
|
std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M*256 && std::abs(m[11]) < MAX_M*256 ) |
|
|
|
|
{ |
|
|
|
|
const int nChannels = 3; |
|
|
|
|
const int cWidth = v_int16x8::nlanes; |
|
|
|
|
// faster fixed-point transformation
|
|
|
|
|
short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE), |
|
|
|
|
m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE), |
|
|
|
|
m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE), |
|
|
|
|
m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE), |
|
|
|
|
m22 = saturate_cast<short>(m[10]*SCALE); |
|
|
|
|
int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ), |
|
|
|
|
m23 = saturate_cast<int>((m[11]+0.5f)*SCALE); |
|
|
|
|
|
|
|
|
|
v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0); |
|
|
|
|
v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0); |
|
|
|
|
v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0); |
|
|
|
|
v_int32x4 m3 = v_int32x4(m03, m13, m23, 0); |
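// each matrix row is stored twice (lanes 1..3 and 4..6), so a single
// v_dotprod inside v_matmulvec() covers two packed pixels per call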
|
|
|
|
int x = 0; |
|
|
|
|
|
|
|
|
|
for (; x <= (len - cWidth) * nChannels; x += cWidth * nChannels) |
|
|
|
|
union { |
|
|
|
|
short s[6]; |
|
|
|
|
int p[3]; |
|
|
|
|
} m16; |
|
|
|
|
m16.s[0] = saturate_cast<short>(m[0] * SCALE); m16.s[1] = saturate_cast<short>(m[1] * SCALE); |
|
|
|
|
m16.s[2] = saturate_cast<short>(m[4] * SCALE); m16.s[3] = saturate_cast<short>(m[5] * SCALE); |
|
|
|
|
m16.s[4] = saturate_cast<short>(m[8] * SCALE); m16.s[5] = saturate_cast<short>(m[9] * SCALE); |
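// the union pairs two 16-bit coefficients per 32-bit slot, so one
// vx_setall_s32(m16.p[i]) broadcast below yields the (m_b, m_g) coefficient
// pair expected by v_dotprod on interleaved b/g samples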
|
|
|
|
int m32[] = {saturate_cast<int>(m[ 2] * SCALE), saturate_cast<int>(m[ 3] * SCALE), |
|
|
|
|
saturate_cast<int>(m[ 6] * SCALE), saturate_cast<int>(m[ 7] * SCALE), |
|
|
|
|
saturate_cast<int>(m[10] * SCALE), saturate_cast<int>(m[11] * SCALE)}; |
|
|
|
|
v_int16 m01 = v_reinterpret_as_s16(vx_setall_s32(m16.p[0])); |
|
|
|
|
v_int32 m2 = vx_setall_s32(m32[0]); |
|
|
|
|
v_int32 m3 = vx_setall_s32(m32[1]); |
|
|
|
|
v_int16 m45 = v_reinterpret_as_s16(vx_setall_s32(m16.p[1])); |
|
|
|
|
v_int32 m6 = vx_setall_s32(m32[2]); |
|
|
|
|
v_int32 m7 = vx_setall_s32(m32[3]); |
|
|
|
|
v_int16 m89 = v_reinterpret_as_s16(vx_setall_s32(m16.p[2])); |
|
|
|
|
v_int32 m10 = vx_setall_s32(m32[4]); |
|
|
|
|
v_int32 m11 = vx_setall_s32(m32[5]); |
|
|
|
|
int x = 0; |
|
|
|
|
for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels) |
|
|
|
|
{ |
|
|
|
|
// load 8 pixels
|
|
|
|
|
v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x)); |
|
|
|
|
v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth)); |
|
|
|
|
v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth * 2)); |
|
|
|
|
v_int16x8 v3; |
|
|
|
|
|
|
|
|
|
// rotate and pack
|
|
|
|
|
v3 = v_rotate_right<1>(v2); // 0 b6 g6 r6 b7 g7 r7 0
|
|
|
|
|
v2 = v_rotate_left <5>(v2, v1); // 0 b4 g4 r4 b5 g5 r5 0
|
|
|
|
|
v1 = v_rotate_left <3>(v1, v0); // 0 b2 g2 r2 b3 g3 r3 0
|
|
|
|
|
v0 = v_rotate_left <1>(v0); // 0 b0 g0 r0 b1 g1 r1 0
|
|
|
|
|
|
|
|
|
|
// multiply with matrix and normalize
|
|
|
|
|
v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0
|
|
|
|
|
v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0
|
|
|
|
|
v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0
|
|
|
|
|
v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0
|
|
|
|
|
|
|
|
|
|
// narrow down as uint8x16
|
|
|
|
|
v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0
|
|
|
|
|
v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0
|
|
|
|
|
v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0
|
|
|
|
|
v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0
|
|
|
|
|
|
|
|
|
|
// rotate and pack
|
|
|
|
|
z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >> 8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40); // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0
|
|
|
|
|
z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0
|
|
|
|
|
z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) << 8); // G5 R5 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0
|
|
|
|
|
|
|
|
|
|
// store on memory
|
|
|
|
|
v_store_low(dst + x, z0); |
|
|
|
|
v_store_low(dst + x + cWidth, z1); |
|
|
|
|
v_store_low(dst + x + cWidth * 2, z2); |
|
|
|
|
v_uint8 b, g, r; |
|
|
|
|
v_load_deinterleave(src + x, b, g, r); |
|
|
|
|
v_uint8 bgl, bgh; |
|
|
|
|
v_zip(b, g, bgl, bgh); |
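// zip interleaves b and g (b0 g0 b1 g1 ...) so each adjacent lane pair can be
// multiplied against the broadcast (m_b, m_g) pair with one v_dotprod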
|
|
|
|
v_uint16 rl, rh; |
|
|
|
|
v_expand(r, rl, rh); |
|
|
|
|
|
|
|
|
|
v_int16 dbl, dbh, dgl, dgh, drl, drh; |
|
|
|
|
v_uint16 p0, p2; |
|
|
|
|
v_int32 p1, p3; |
|
|
|
|
v_expand(bgl, p0, p2); |
|
|
|
|
v_expand(v_reinterpret_as_s16(rl), p1, p3); |
|
|
|
|
dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3, |
|
|
|
|
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3); |
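// per channel: (m_b*b + m_g*g) comes from v_dotprod, the red term and offset
// are added in widened int32, then v_rshr_pack<BITS> does a rounding shift back
// from Q10 and narrows both int32 halves into one int16 vector; the same
// pattern repeats for green/red and for the high halves below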
|
|
|
|
dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7, |
|
|
|
|
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7); |
|
|
|
|
drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11, |
|
|
|
|
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11); |
|
|
|
|
v_expand(bgh, p0, p2); |
|
|
|
|
v_expand(v_reinterpret_as_s16(rh), p1, p3); |
|
|
|
|
dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3, |
|
|
|
|
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3); |
|
|
|
|
dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7, |
|
|
|
|
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7); |
|
|
|
|
drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11, |
|
|
|
|
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11); |
|
|
|
|
v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE); |
|
|
|
|
m32[3] = saturate_cast<int>((m[7] + 0.5f)*SCALE); |
|
|
|
|
m32[5] = saturate_cast<int>((m[11] + 0.5f)*SCALE); |
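// the scalar tail truncates with >> BITS instead of a rounding shift, so the
// 0.5 rounding term is folded back into the offsets here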
|
|
|
|
for( ; x < len * nChannels; x += nChannels ) |
|
|
|
|
{ |
|
|
|
|
int v0 = src[x], v1 = src[x+1], v2 = src[x+2]; |
|
|
|
|
uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS); |
|
|
|
|
uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS); |
|
|
|
|
uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS); |
|
|
|
|
uchar t0 = saturate_cast<uchar>((m16.s[0] * v0 + m16.s[1] * v1 + m32[0] * v2 + m32[1]) >> BITS); |
|
|
|
|
uchar t1 = saturate_cast<uchar>((m16.s[2] * v0 + m16.s[3] * v1 + m32[2] * v2 + m32[3]) >> BITS); |
|
|
|
|
uchar t2 = saturate_cast<uchar>((m16.s[4] * v0 + m16.s[5] * v1 + m32[4] * v2 + m32[5]) >> BITS); |
|
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; |
|
|
|
|
} |
|
|
|
|
vx_cleanup(); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
@@ -1570,64 +1537,65 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
|
|
|
|
static void |
|
|
|
|
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn ) |
|
|
|
|
{ |
|
|
|
|
#if CV_SIMD128 && !defined(__aarch64__) |
|
|
|
|
if( hasSIMD128() && scn == 3 && dcn == 3 ) |
|
|
|
|
#if CV_SIMD && !defined(__aarch64__) |
|
|
|
|
if( scn == 3 && dcn == 3 ) |
|
|
|
|
{ |
|
|
|
|
const int nChannels = 3; |
|
|
|
|
const int cWidth = v_float32x4::nlanes; |
|
|
|
|
v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0); |
|
|
|
|
v_float32x4 m0, m1, m2, m3; |
|
|
|
|
load3x3Matrix(m, m0, m1, m2, m3); |
|
|
|
|
m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f); |
|
|
|
|
|
|
|
|
|
int x = 0; |
|
|
|
|
for( ; x <= (len - cWidth) * nChannels; x += cWidth * nChannels ) |
|
|
|
|
#if CV_SIMD_WIDTH > 16 |
|
|
|
|
v_float32 m0 = vx_setall_f32(m[ 0]); |
|
|
|
|
v_float32 m1 = vx_setall_f32(m[ 1]); |
|
|
|
|
v_float32 m2 = vx_setall_f32(m[ 2]); |
|
|
|
|
v_float32 m3 = vx_setall_f32(m[ 3] - 32768.f); |
|
|
|
|
v_float32 m4 = vx_setall_f32(m[ 4]); |
|
|
|
|
v_float32 m5 = vx_setall_f32(m[ 5]); |
|
|
|
|
v_float32 m6 = vx_setall_f32(m[ 6]); |
|
|
|
|
v_float32 m7 = vx_setall_f32(m[ 7] - 32768.f); |
|
|
|
|
v_float32 m8 = vx_setall_f32(m[ 8]); |
|
|
|
|
v_float32 m9 = vx_setall_f32(m[ 9]); |
|
|
|
|
v_float32 m10 = vx_setall_f32(m[10]); |
|
|
|
|
v_float32 m11 = vx_setall_f32(m[11] - 32768.f); |
|
|
|
|
v_int16 delta = vx_setall_s16(-32768); |
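// results are biased by -32768 so they survive the signed int16 saturation of
// v_pack(); v_add_wrap with 0x8000 afterwards restores the full unsigned
// 16-bit range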
|
|
|
|
for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3) |
|
|
|
|
{ |
|
|
|
|
// load 4 pixels
|
|
|
|
|
v_uint16x8 v0_16 = v_load(src + x); // b0 g0 r0 b1 g1 r1 b2 g2
|
|
|
|
|
v_uint16x8 v2_16 = v_load_low(src + x + cWidth * 2); // r2 b3 g3 r3 ? ? ? ?
|
|
|
|
|
|
|
|
|
|
// expand to 4 vectors
|
|
|
|
|
v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32; |
|
|
|
|
v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32); // b1 g1 r1
|
|
|
|
|
v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32); // b3 g3 r3
|
|
|
|
|
v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32); // b2 g2 r2
|
|
|
|
|
v_expand(v0_16, v0_32, dummy_32); // b0 g0 r0
|
|
|
|
|
|
|
|
|
|
// convert to float32x4
|
|
|
|
|
v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32)); // b0 g0 r0
|
|
|
|
|
v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32)); // b1 g1 r1
|
|
|
|
|
v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32)); // b2 g2 r2
|
|
|
|
|
v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32)); // b3 g3 r3
|
|
|
|
|
|
|
|
|
|
// multiply and convert back to int32x4
|
|
|
|
|
v_int32x4 y0, y1, y2, y3; |
|
|
|
|
y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3)); // B0 G0 R0
|
|
|
|
|
y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3)); // B1 G1 R1
|
|
|
|
|
y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3)); // B2 G2 R2
|
|
|
|
|
y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3)); // B3 G3 R3
|
|
|
|
|
|
|
|
|
|
// narrow down to int16x8
|
|
|
|
|
v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta); // 0 B0 G0 R0 B1 G1 R1 0
|
|
|
|
|
v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta); // 0 B2 G2 R2 B3 G3 R3 0
|
|
|
|
|
|
|
|
|
|
// rotate and pack
|
|
|
|
|
v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2); // B0 G0 R0 B1 G1 R1 B2 G2
|
|
|
|
|
v2 = v_rotate_right<3>(v2); // R2 B3 G3 R3 0 0 0 0
|
|
|
|
|
|
|
|
|
|
// store 4 pixels
|
|
|
|
|
v_store(dst + x, v_reinterpret_as_u16(v0)); |
|
|
|
|
v_store_low(dst + x + cWidth * 2, v_reinterpret_as_u16(v2)); |
|
|
|
|
v_uint16 b, g, r; |
|
|
|
|
v_load_deinterleave(src + x, b, g, r); |
|
|
|
|
v_uint32 bl, bh, gl, gh, rl, rh; |
|
|
|
|
v_expand(b, bl, bh); |
|
|
|
|
v_expand(g, gl, gh); |
|
|
|
|
v_expand(r, rl, rh); |
|
|
|
|
|
|
|
|
|
v_int16 db, dg, dr; |
|
|
|
|
db = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m2, m3)))), |
|
|
|
|
v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m2, m3))))), delta); |
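// each destination channel is m_row[0]*b + m_row[1]*g + m_row[2]*r + offset,
// evaluated with chained v_muladd in float, rounded, packed with the -32768
// bias and wrapped back as described above; dg and dr below repeat the same
// pattern with the next matrix rows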
|
|
|
|
dg = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m6, m7)))), |
|
|
|
|
v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m6, m7))))), delta); |
|
|
|
|
dr = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m10, m11)))), |
|
|
|
|
v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m10, m11))))), delta); |
|
|
|
|
v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for( ; x < len * nChannels; x += nChannels ) |
|
|
|
|
#endif |
|
|
|
|
v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f); |
|
|
|
|
v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f); |
|
|
|
|
v_float32x4 _m2l(m[2], m[6], m[10], 0.f); |
|
|
|
|
v_float32x4 _m3l(m[3] - 32768.f, m[7] - 32768.f, m[11] - 32768.f, 0.f); |
|
|
|
|
v_float32x4 _m0h = v_rotate_left<1>(_m0l); |
|
|
|
|
v_float32x4 _m1h = v_rotate_left<1>(_m1l); |
|
|
|
|
v_float32x4 _m2h = v_rotate_left<1>(_m2l); |
|
|
|
|
v_float32x4 _m3h = v_rotate_left<1>(_m3l); |
|
|
|
|
v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0); |
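// the _m*h columns are the _m*l columns rotated left by one lane: the first
// pixel's result then lands in lanes 1..3 and the second pixel's in lanes 4..6
// of the packed vector, so a single v_rotate_right<1> yields six contiguous
// output values per iteration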
|
|
|
|
for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 ) |
|
|
|
|
v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack( |
|
|
|
|
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)), |
|
|
|
|
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta)))); |
|
|
|
|
for( ; x < len * 3; x += 3 ) |
|
|
|
|
{ |
|
|
|
|
float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2]; |
|
|
|
|
ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[2] * v2 + m[3]); |
|
|
|
|
ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[6] * v2 + m[7]); |
|
|
|
|
ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[ 2] * v2 + m[ 3]); |
|
|
|
|
ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[ 6] * v2 + m[ 7]); |
|
|
|
|
ushort t2 = saturate_cast<ushort>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]); |
|
|
|
|
dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2; |
|
|
|
|
} |
|
|
|
|
vx_cleanup(); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
@@ -1638,52 +1606,68 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
|
|
|
|
static void |
|
|
|
|
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn ) |
|
|
|
|
{ |
|
|
|
|
#if CV_SIMD128 && !defined(__aarch64__) |
|
|
|
|
if( hasSIMD128() ) |
|
|
|
|
#if CV_SIMD && !defined(__aarch64__) |
|
|
|
|
int x = 0; |
|
|
|
|
if( scn == 3 && dcn == 3 ) |
|
|
|
|
{ |
|
|
|
|
int x = 0; |
|
|
|
|
if( scn == 3 && dcn == 3 ) |
|
|
|
|
int idx[v_float32::nlanes/2]; |
|
|
|
|
for( int i = 0; i < v_float32::nlanes/4; i++ ) |
|
|
|
|
{ |
|
|
|
|
const int cWidth = 3; |
|
|
|
|
v_float32x4 m0, m1, m2, m3; |
|
|
|
|
load3x3Matrix(m, m0, m1, m2, m3); |
|
|
|
|
|
|
|
|
|
for( ; x < (len - 1)*cWidth; x += cWidth ) |
|
|
|
|
{ |
|
|
|
|
v_float32x4 x0 = v_load(src + x); |
|
|
|
|
v_float32x4 y0 = v_matmuladd(x0, m0, m1, m2, m3); |
|
|
|
|
v_store_low(dst + x, y0); |
|
|
|
|
dst[x + 2] = v_combine_high(y0, y0).get0(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for( ; x < len*cWidth; x += cWidth ) |
|
|
|
|
{ |
|
|
|
|
float v0 = src[x], v1 = src[x+1], v2 = src[x+2]; |
|
|
|
|
float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); |
|
|
|
|
float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); |
|
|
|
|
float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); |
|
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; |
|
|
|
|
} |
|
|
|
|
return; |
|
|
|
|
idx[i] = 3*i; |
|
|
|
|
idx[i + v_float32::nlanes/4] = 0; |
|
|
|
|
} |
|
|
|
|
float _m[] = { m[0], m[4], m[ 8], 0.f, |
|
|
|
|
m[1], m[5], m[ 9], 0.f, |
|
|
|
|
m[2], m[6], m[10], 0.f, |
|
|
|
|
m[3], m[7], m[11], 0.f }; |
|
|
|
|
v_float32 m0 = vx_lut_quads(_m , idx + v_float32::nlanes/4); |
|
|
|
|
v_float32 m1 = vx_lut_quads(_m + 4, idx + v_float32::nlanes/4); |
|
|
|
|
v_float32 m2 = vx_lut_quads(_m + 8, idx + v_float32::nlanes/4); |
|
|
|
|
v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4); |
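// the first nlanes/4 entries of idx step through pixels (stride 3) and gather
// (b, g, r, .) quads from src; the zero entries replicate each matrix column
// into every 128-bit block so v_matmuladd operates per quad, and
// v_pack_triplets drops the unused fourth lane of each result before the store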
|
|
|
|
for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 ) |
|
|
|
|
v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3))); |
|
|
|
|
for( ; x < len*3; x += 3 ) |
|
|
|
|
{ |
|
|
|
|
float v0 = src[x], v1 = src[x+1], v2 = src[x+2]; |
|
|
|
|
float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]); |
|
|
|
|
float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[ 6]*v2 + m[ 7]); |
|
|
|
|
float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); |
|
|
|
|
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; |
|
|
|
|
} |
|
|
|
|
vx_cleanup(); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if( scn == 4 && dcn == 4 ) |
|
|
|
|
if( scn == 4 && dcn == 4 ) |
|
|
|
|
{ |
|
|
|
|
#if CV_SIMD_WIDTH > 16 |
|
|
|
|
int idx[v_float32::nlanes/4]; |
|
|
|
|
for( int i = 0; i < v_float32::nlanes/4; i++ ) |
|
|
|
|
idx[i] = 0; |
|
|
|
|
float _m[] = { m[4], m[9], m[14], m[19] }; |
|
|
|
|
v_float32 m0 = vx_lut_quads(m , idx); |
|
|
|
|
v_float32 m1 = vx_lut_quads(m+ 5, idx); |
|
|
|
|
v_float32 m2 = vx_lut_quads(m+10, idx); |
|
|
|
|
v_float32 m3 = vx_lut_quads(m+15, idx); |
|
|
|
|
v_float32 m4 = vx_lut_quads(_m, idx); |
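// m0..m3 replicate the four matrix rows into every 128-bit block and m4 the
// translation column, so v_reduce_sum4 below produces one transformed
// 4-channel pixel per block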
|
|
|
|
for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes ) |
|
|
|
|
{ |
|
|
|
|
const int cWidth = 4; |
|
|
|
|
v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]); |
|
|
|
|
v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]); |
|
|
|
|
v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]); |
|
|
|
|
v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]); |
|
|
|
|
v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]); |
|
|
|
|
|
|
|
|
|
for( ; x < len*cWidth; x += cWidth ) |
|
|
|
|
{ |
|
|
|
|
v_float32x4 x0 = v_load(src + x); |
|
|
|
|
v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4; |
|
|
|
|
v_store(dst + x, y0); |
|
|
|
|
} |
|
|
|
|
return; |
|
|
|
|
v_float32 v_src = vx_load(src + x); |
|
|
|
|
v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
v_float32x4 _m0 = v_load(m ); |
|
|
|
|
v_float32x4 _m1 = v_load(m + 5); |
|
|
|
|
v_float32x4 _m2 = v_load(m + 10); |
|
|
|
|
v_float32x4 _m3 = v_load(m + 15); |
|
|
|
|
v_float32x4 _m4(m[4], m[9], m[14], m[19]); |
|
|
|
|
for( ; x < len*4; x += v_float32x4::nlanes ) |
|
|
|
|
{ |
|
|
|
|
v_float32x4 v_src = v_load(src + x); |
|
|
|
|
v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4); |
|
|
|
|
} |
|
|
|
|
vx_cleanup(); |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|