diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index 760bf39e0f..ef54bb037c 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -1451,115 +1451,82 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
     }
 }
 
-#if CV_SIMD128 && !defined(__aarch64__)
-static inline void
-load3x3Matrix(const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3)
-{
-    m0 = v_float32x4(m[0], m[4], m[8], 0);
-    m1 = v_float32x4(m[1], m[5], m[9], 0);
-    m2 = v_float32x4(m[2], m[6], m[10], 0);
-    m3 = v_float32x4(m[3], m[7], m[11], 0);
-}
-#endif
-
-#if CV_SIMD128
-static inline v_int16x8
-v_matmulvec(const v_int16x8 &v0, const v_int16x8 &m0, const v_int16x8 &m1, const v_int16x8 &m2, const v_int32x4 &m3, const int BITS)
-{
-    // v0 : 0 b0 g0 r0 b1 g1 r1 ?
-    v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1
-    v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1
-    v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1
-    v_int32x4 t3 = v_setzero_s32();
-    v_int32x4 s0, s1, s2, s3;
-    v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3);
-    s0 = s0 + s1 + m3; // B0 G0 R0 ?
-    s2 = s2 + s3 + m3; // B1 G1 R1 ?
-
-    s0 = s0 >> BITS;
-    s2 = s2 >> BITS;
-
-    v_int16x8 result = v_pack(s0, v_setzero_s32()); // B0 G0 R0 0 0 0 0 0
-    result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16); // 0 B0 G0 R0 0 0 0 0
-    result = result | v_pack(v_setzero_s32(), s2); // 0 B0 G0 R0 B1 G1 R1 0
-    return result;
-}
-#endif
-
 static void
 transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD128
+#if CV_SIMD
     const int BITS = 10, SCALE = 1 << BITS;
     const float MAX_M = (float)(1 << (15 - BITS));
 
-    if( hasSIMD128() && scn == 3 && dcn == 3 &&
-        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 &&
-        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 &&
-        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
+    if( scn == 3 && dcn == 3 &&
+        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[ 2]) < MAX_M*256 && std::abs(m[ 3]) < MAX_M*256 &&
+        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[ 6]) < MAX_M*256 && std::abs(m[ 7]) < MAX_M*256 &&
+        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M*256 && std::abs(m[11]) < MAX_M*256 )
     {
         const int nChannels = 3;
-        const int cWidth = v_int16x8::nlanes;
-        // faster fixed-point transformation
-        short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE),
-            m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE),
-            m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE),
-            m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE),
-            m22 = saturate_cast<short>(m[10]*SCALE);
-        int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ),
-            m23 = saturate_cast<int>((m[11]+0.5f)*SCALE);
-
-        v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0);
-        v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0);
-        v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0);
-        v_int32x4 m3 = v_int32x4(m03, m13, m23, 0);
-        int x = 0;
-        for (; x <= (len - cWidth) * nChannels; x += cWidth * nChannels)
+        union {
+            short s[6];
+            int p[3];
+        } m16;
+        m16.s[0] = saturate_cast<short>(m[0] * SCALE); m16.s[1] = saturate_cast<short>(m[1] * SCALE);
+        m16.s[2] = saturate_cast<short>(m[4] * SCALE); m16.s[3] = saturate_cast<short>(m[5] * SCALE);
+        m16.s[4] = saturate_cast<short>(m[8] * SCALE); m16.s[5] = saturate_cast<short>(m[9] * SCALE);
+        int m32[] = {saturate_cast<int>(m[ 2] * SCALE), saturate_cast<int>(m[ 3] * SCALE),
+                     saturate_cast<int>(m[ 6] * SCALE), saturate_cast<int>(m[ 7] * SCALE),
+                     saturate_cast<int>(m[10] * SCALE), saturate_cast<int>(m[11] * SCALE)};
+        v_int16 m01 = v_reinterpret_as_s16(vx_setall_s32(m16.p[0]));
+        v_int32 m2 = vx_setall_s32(m32[0]);
+        v_int32 m3 = vx_setall_s32(m32[1]);
+        v_int16 m45 = v_reinterpret_as_s16(vx_setall_s32(m16.p[1]));
+        v_int32 m6 = vx_setall_s32(m32[2]);
+        v_int32 m7 = vx_setall_s32(m32[3]);
+        v_int16 m89 = v_reinterpret_as_s16(vx_setall_s32(m16.p[2]));
+        v_int32 m10 = vx_setall_s32(m32[4]);
+        v_int32 m11 = vx_setall_s32(m32[5]);
+        int x = 0;
+        for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels)
         {
-            // load 8 pixels
-            v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x));
-            v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth));
-            v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth * 2));
-            v_int16x8 v3;
-
-            // rotate and pack
-            v3 = v_rotate_right<1>(v2);     // 0 b6 g6 r6 b7 g7 r7 0
-            v2 = v_rotate_left <5>(v2, v1); // 0 b4 g4 r4 b5 g5 r5 0
-            v1 = v_rotate_left <3>(v1, v0); // 0 b2 g2 r2 b3 g3 r3 0
-            v0 = v_rotate_left <1>(v0);     // 0 b0 g0 r0 b1 g1 r1 0
-
-            // multiply with matrix and normalize
-            v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0
-            v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0
-            v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0
-            v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0
-
-            // narrow down as uint8x16
-            v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0
-            v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0
-            v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0
-            v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0
-
-            // rotate and pack
-            z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >> 8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40); // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0
-            z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0
-            z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) << 8); // G5 R6 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0
-
-            // store on memory
-            v_store_low(dst + x, z0);
-            v_store_low(dst + x + cWidth, z1);
-            v_store_low(dst + x + cWidth * 2, z2);
+            v_uint8 b, g, r;
+            v_load_deinterleave(src + x, b, g, r);
+            v_uint8 bgl, bgh;
+            v_zip(b, g, bgl, bgh);
+            v_uint16 rl, rh;
+            v_expand(r, rl, rh);
+
+            v_int16 dbl, dbh, dgl, dgh, drl, drh;
+            v_uint16 p0, p2;
+            v_int32 p1, p3;
+            v_expand(bgl, p0, p2);
+            v_expand(v_reinterpret_as_s16(rl), p1, p3);
+            dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
+            dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
+            drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            v_expand(bgh, p0, p2);
+            v_expand(v_reinterpret_as_s16(rh), p1, p3);
+            dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
+            dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
+            drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
+                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
         }
-
+        m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
+        m32[3] = saturate_cast<int>((m[7] + 0.5f)*SCALE);
+        m32[5] = saturate_cast<int>((m[11] + 0.5f)*SCALE);
         for( ; x < len * nChannels; x += nChannels )
         {
             int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
-            uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS);
-            uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS);
-            uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS);
+            uchar t0 = saturate_cast<uchar>((m16.s[0] * v0 + m16.s[1] * v1 + m32[0] * v2 + m32[1]) >> BITS);
+            uchar t1 = saturate_cast<uchar>((m16.s[2] * v0 + m16.s[3] * v1 + m32[2] * v2 + m32[3]) >> BITS);
+            uchar t2 = saturate_cast<uchar>((m16.s[4] * v0 + m16.s[5] * v1 + m32[4] * v2 + m32[5]) >> BITS);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
         }
+        vx_cleanup();
        return;
     }
 #endif
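For reference (not part of the patch): in the new 8-bit path the B and G coefficients stay 16-bit so v_dotprod can combine each interleaved (b, g) pair with one multiply-add, while the R coefficient and the offset column are widened to 32-bit (m32), which is why the guard on m[2], m[6] and m[10] is relaxed from MAX_M to MAX_M*256. A minimal scalar sketch of the same fixed-point quantization, with illustrative names only, assuming BITS = 10 and round-to-nearest coefficients:

#include <cmath>

// Scalar model of one pixel of the fixed-point 8-bit transform (illustrative only).
static inline void transform3x4_8u_scalar(const unsigned char src[3], unsigned char dst[3],
                                          const float m[12])
{
    const int BITS = 10, SCALE = 1 << BITS;
    for (int c = 0; c < 3; c++)
    {
        // coefficients are pre-scaled by SCALE; the +0.5f folded into the offset rounds the final shift
        int c0 = (int)std::lround(m[4*c + 0] * SCALE);
        int c1 = (int)std::lround(m[4*c + 1] * SCALE);
        int c2 = (int)std::lround(m[4*c + 2] * SCALE);
        int c3 = (int)std::lround((m[4*c + 3] + 0.5f) * SCALE);
        int v  = (c0 * src[0] + c1 * src[1] + c2 * src[2] + c3) >> BITS;
        dst[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);   // saturate to uchar
    }
}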
@@ -1570,64 +1537,65 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
 static void
 transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD128 && !defined(__aarch64__)
-    if( hasSIMD128() && scn == 3 && dcn == 3 )
+#if CV_SIMD && !defined(__aarch64__)
+    if( scn == 3 && dcn == 3 )
     {
-        const int nChannels = 3;
-        const int cWidth = v_float32x4::nlanes;
-        v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
-        v_float32x4 m0, m1, m2, m3;
-        load3x3Matrix(m, m0, m1, m2, m3);
-        m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f);
         int x = 0;
-        for( ; x <= (len - cWidth) * nChannels; x += cWidth * nChannels )
+#if CV_SIMD_WIDTH > 16
+        v_float32 m0  = vx_setall_f32(m[ 0]);
+        v_float32 m1  = vx_setall_f32(m[ 1]);
+        v_float32 m2  = vx_setall_f32(m[ 2]);
+        v_float32 m3  = vx_setall_f32(m[ 3] - 32768.f);
+        v_float32 m4  = vx_setall_f32(m[ 4]);
+        v_float32 m5  = vx_setall_f32(m[ 5]);
+        v_float32 m6  = vx_setall_f32(m[ 6]);
+        v_float32 m7  = vx_setall_f32(m[ 7] - 32768.f);
+        v_float32 m8  = vx_setall_f32(m[ 8]);
+        v_float32 m9  = vx_setall_f32(m[ 9]);
+        v_float32 m10 = vx_setall_f32(m[10]);
+        v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
+        v_int16 delta = vx_setall_s16(-32768);
+        for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3)
         {
-            // load 4 pixels
-            v_uint16x8 v0_16 = v_load(src + x);                  // b0 g0 r0 b1 g1 r1 b2 g2
-            v_uint16x8 v2_16 = v_load_low(src + x + cWidth * 2); // r2 b3 g3 r3 ? ? ? ?
-
-            // expand to 4 vectors
-            v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32;
-            v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32);        // b1 g1 r1
-            v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32);        // b3 g3 r3
-            v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32); // b2 g2 r2
-            v_expand(v0_16, v0_32, dummy_32);                           // b0 g0 r0
-
-            // convert to float32x4
-            v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32)); // b0 g0 r0
-            v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32)); // b1 g1 r1
-            v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32)); // b2 g2 r2
-            v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32)); // b3 g3 r3
-
-            // multiply and convert back to int32x4
-            v_int32x4 y0, y1, y2, y3;
-            y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3)); // B0 G0 R0
-            y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3)); // B1 G1 R1
-            y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3)); // B2 G2 R2
-            y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3)); // B3 G3 R3
-
-            // narrow down to int16x8
-            v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta); // 0 B0 G0 R0 B1 G1 R1 0
-            v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta); // 0 B2 G2 R2 B3 G3 R3 0
-
-            // rotate and pack
-            v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2); // B0 G0 R0 B1 G1 R1 B2 G2
-            v2 = v_rotate_right<3>(v2);                        // R2 B3 G3 R3 0 0 0 0
-
-            // store 4 pixels
-            v_store(dst + x, v_reinterpret_as_u16(v0));
-            v_store_low(dst + x + cWidth * 2, v_reinterpret_as_u16(v2));
+            v_uint16 b, g, r;
+            v_load_deinterleave(src + x, b, g, r);
+            v_uint32 bl, bh, gl, gh, rl, rh;
+            v_expand(b, bl, bh);
+            v_expand(g, gl, gh);
+            v_expand(r, rl, rh);
+
+            v_int16 db, dg, dr;
+            db = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m2, m3)))),
+                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m0, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m1, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m2, m3))))), delta);
+            dg = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m6, m7)))),
+                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m4, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m5, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m6, m7))))), delta);
+            dr = v_add_wrap(v_pack(v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bl)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gl)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rl)), m10, m11)))),
+                                   v_round(v_muladd(v_cvt_f32(v_reinterpret_as_s32(bh)), m8, v_muladd(v_cvt_f32(v_reinterpret_as_s32(gh)), m9, v_muladd(v_cvt_f32(v_reinterpret_as_s32(rh)), m10, m11))))), delta);
            v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
         }
-
-        for( ; x < len * nChannels; x += nChannels )
+#endif
+        v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f);
+        v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f);
+        v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
+        v_float32x4 _m3l(m[3] - 32768.f, m[7] - 32768.f, m[11] - 32768.f, 0.f);
+        v_float32x4 _m0h = v_rotate_left<1>(_m0l);
+        v_float32x4 _m1h = v_rotate_left<1>(_m1l);
+        v_float32x4 _m2h = v_rotate_left<1>(_m2l);
+        v_float32x4 _m3h = v_rotate_left<1>(_m3l);
+        v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
+        for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 )
+            v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
+                             v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x    ))), _m0h, _m1h, _m2h, _m3h)),
+                             v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
+        for( ; x < len * 3; x += 3 )
         {
             float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
-            ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[2] * v2 + m[3]);
-            ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[6] * v2 + m[7]);
+            ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[ 2] * v2 + m[ 3]);
+            ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[ 6] * v2 + m[ 7]);
             ushort t2 = saturate_cast<ushort>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
             dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
         }
+        vx_cleanup();
         return;
     }
 #endif
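For reference (not part of the patch): both 16-bit loops above fold a -32768 bias into the offset column and undo it after the pack. v_pack saturates to signed int16, so re-centring the unsigned range [0, 65535] onto [-32768, 32767] before the pack and then adding -32768 back with wrap-around (v_add_wrap) emulates unsigned 16-bit saturation. A scalar sketch of that trick, with an illustrative function name that is not OpenCV API:

#include <stdint.h>

// Scalar model of the bias trick: unsigned 16-bit saturation via a signed saturating pack.
static inline uint16_t saturate_u16_via_s16(int32_t rounded)    // rounded = round(m*v + offset)
{
    int32_t biased = rounded - 32768;               // the -32768 folded into the offset column
    if (biased < -32768) biased = -32768;           // what the signed v_pack saturation does
    if (biased >  32767) biased =  32767;
    return (uint16_t)(biased + 32768);              // wrap-around add back, reinterpret as unsigned
}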
@@ -1638,52 +1606,68 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
 static void
 transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD128 && !defined(__aarch64__)
-    if( hasSIMD128() )
+#if CV_SIMD && !defined(__aarch64__)
+    int x = 0;
+    if( scn == 3 && dcn == 3 )
     {
-        int x = 0;
-        if( scn == 3 && dcn == 3 )
+        int idx[v_float32::nlanes/2];
+        for( int i = 0; i < v_float32::nlanes/4; i++ )
         {
-            const int cWidth = 3;
-            v_float32x4 m0, m1, m2, m3;
-            load3x3Matrix(m, m0, m1, m2, m3);
-
-            for( ; x < (len - 1)*cWidth; x += cWidth )
-            {
-                v_float32x4 x0 = v_load(src + x);
-                v_float32x4 y0 = v_matmuladd(x0, m0, m1, m2, m3);
-                v_store_low(dst + x, y0);
-                dst[x + 2] = v_combine_high(y0, y0).get0();
-            }
-
-            for( ; x < len*cWidth; x += cWidth )
-            {
-                float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
-                float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
-                float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
-                float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
-                dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
-            }
-            return;
+            idx[i] = 3*i;
+            idx[i + v_float32::nlanes/4] = 0;
+        }
+        float _m[] = { m[0], m[4], m[ 8], 0.f,
+                       m[1], m[5], m[ 9], 0.f,
+                       m[2], m[6], m[10], 0.f,
+                       m[3], m[7], m[11], 0.f };
+        v_float32 m0 = vx_lut_quads(_m     , idx + v_float32::nlanes/4);
+        v_float32 m1 = vx_lut_quads(_m +  4, idx + v_float32::nlanes/4);
+        v_float32 m2 = vx_lut_quads(_m +  8, idx + v_float32::nlanes/4);
+        v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
+        for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 )
+            v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
+        for( ; x < len*3; x += 3 )
+        {
+            float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
+            float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]);
+            float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[ 6]*v2 + m[ 7]);
+            float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
+            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
         }
+        vx_cleanup();
+        return;
+    }
 
-        if( scn == 4 && dcn == 4 )
+    if( scn == 4 && dcn == 4 )
+    {
+#if CV_SIMD_WIDTH > 16
+        int idx[v_float32::nlanes/4];
+        for( int i = 0; i < v_float32::nlanes/4; i++ )
+            idx[i] = 0;
+        float _m[] = { m[4], m[9], m[14], m[19] };
+        v_float32 m0 = vx_lut_quads(m   , idx);
+        v_float32 m1 = vx_lut_quads(m+ 5, idx);
+        v_float32 m2 = vx_lut_quads(m+10, idx);
+        v_float32 m3 = vx_lut_quads(m+15, idx);
+        v_float32 m4 = vx_lut_quads(_m, idx);
+        for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes )
         {
-            const int cWidth = 4;
-            v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]);
-            v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]);
-            v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]);
-            v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]);
-            v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]);
-
-            for( ; x < len*cWidth; x += cWidth )
-            {
-                v_float32x4 x0 = v_load(src + x);
-                v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4;
-                v_store(dst + x, y0);
-            }
-            return;
+            v_float32 v_src = vx_load(src + x);
+            v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
         }
+#endif
+        v_float32x4 _m0 = v_load(m     );
+        v_float32x4 _m1 = v_load(m +  5);
+        v_float32x4 _m2 = v_load(m + 10);
+        v_float32x4 _m3 = v_load(m + 15);
+        v_float32x4 _m4(m[4], m[9], m[14], m[19]);
+        for( ; x < len*4; x += v_float32x4::nlanes )
+        {
+            v_float32x4 v_src = v_load(src + x);
+            v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
+        }
+        vx_cleanup();
+        return;
     }
 #endif
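For reference (not part of the patch): in the 4-channel float path each output channel is the dot product of the pixel with one matrix row plus that row's fifth element; the vector code computes the four dot products with v_reduce_sum4 and adds the separately gathered offset vector (m[4], m[9], m[14], m[19]). The per-pixel arithmetic, written out as a scalar sketch with an illustrative name:

// Scalar model of one pixel of the 4-channel float path (m is the 4x5 transform matrix, row-major).
static inline void transform4x5_32f_scalar(const float src[4], float dst[4], const float m[20])
{
    for (int c = 0; c < 4; c++)
        dst[c] = m[5*c + 0]*src[0] + m[5*c + 1]*src[1] +
                 m[5*c + 2]*src[2] + m[5*c + 3]*src[3] + m[5*c + 4];
}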