diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index c8245c487d..63bd386119 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1370,15 +1370,15 @@ public: v_float32x4 r2 = v_load_aligned(rptr + vsz_a*2); v_float32x4 r3 = v_load_aligned(rptr + vsz_a*3); - vs00 += w0*r0; - vs01 += w0*r1; - vs02 += w0*r2; - vs03 += w0*r3; - - vs10 += w1*r0; - vs11 += w1*r1; - vs12 += w1*r2; - vs13 += w1*r3; + vs00 = v_fma(w0, r0, vs00); + vs01 = v_fma(w0, r1, vs01); + vs02 = v_fma(w0, r2, vs02); + vs03 = v_fma(w0, r3, vs03); + + vs10 = v_fma(w1, r0, vs10); + vs11 = v_fma(w1, r1, vs11); + vs12 = v_fma(w1, r2, vs12); + vs13 = v_fma(w1, r3, vs13); } s0 += v_reduce_sum4(vs00, vs01, vs02, vs03); s1 += v_reduce_sum4(vs10, vs11, vs12, vs13); @@ -2035,20 +2035,21 @@ public: for( ; n <= nmax - 4; n += 4 ) { + v_float32x4 d0 = v_load(dst0 + n); + v_float32x4 d1 = v_load(dst1 + n); v_float32x4 b0 = v_load(bptr0 + n); v_float32x4 b1 = v_load(bptr1 + n); v_float32x4 b2 = v_load(bptr2 + n); v_float32x4 b3 = v_load(bptr3 + n); - v_float32x4 d0 = v_load(dst0 + n); - v_float32x4 d1 = v_load(dst1 + n); - d0 += b0*a00; - d1 += b0*a01; - d0 += b1*a10; - d1 += b1*a11; - d0 += b2*a20; - d1 += b2*a21; - d0 += b3*a30; - d1 += b3*a31; + // TODO try to improve pipeline width + d0 = v_fma(b0, a00, d0); + d1 = v_fma(b0, a01, d1); + d0 = v_fma(b1, a10, d0); + d1 = v_fma(b1, a11, d1); + d0 = v_fma(b2, a20, d0); + d1 = v_fma(b2, a21, d1); + d0 = v_fma(b3, a30, d0); + d1 = v_fma(b3, a31, d1); v_store(dst0 + n, d0); v_store(dst1 + n, d1); } @@ -2056,8 +2057,10 @@ public: for( ; n < nmax; n++ ) { - float b0 = bptr0[n], b1 = bptr1[n]; - float b2 = bptr2[n], b3 = bptr3[n]; + float b0 = bptr0[n]; + float b1 = bptr1[n]; + float b2 = bptr2[n]; + float b3 = bptr3[n]; float d0 = dst0[n] + alpha00*b0 + alpha10*b1 + alpha20*b2 + alpha30*b3; float d1 = dst1[n] + alpha01*b0 + alpha11*b1 + alpha21*b2 + alpha31*b3; dst0[n] = d0; diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 4746403504..d9c6f6edb1 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -241,16 +241,18 @@ public: #if CV_SIMD128 for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) { - v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f); - v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f); + v_float32x4 vs0 = v_setall_f32(0.f); + v_float32x4 vs1 = v_setall_f32(0.f); + v_float32x4 vs2 = v_setall_f32(0.f); + v_float32x4 vs3 = v_setall_f32(0.f); for( k = 0; k < vecsize; k += 4 ) { v_float32x4 v = v_load_aligned(sptr + k); - vs0 += v*v_load_aligned(wptr + k); - vs1 += v*v_load_aligned(wptr + wstep + k); - vs2 += v*v_load_aligned(wptr + wstep*2 + k); - vs3 += v*v_load_aligned(wptr + wstep*3 + k); + vs0 = v_fma(v, v_load_aligned(wptr + k), vs0); + vs1 = v_fma(v, v_load_aligned(wptr + wstep + k), vs1); + vs2 = v_fma(v, v_load_aligned(wptr + wstep*2 + k), vs2); + vs3 = v_fma(v, v_load_aligned(wptr + wstep*3 + k), vs3); } v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);