mirror of https://github.com/opencv/opencv.git
Merge pull request #26155 from mshabunin:dnn-dispatch
dnn: use dispatching for Winograd optimizations
pull/26461/head
commit
4866811933
6 changed files with 1059 additions and 1111 deletions
@ -0,0 +1,22 @@ |
|||||||
|
// This file is part of OpenCV project.
|
||||||
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||||
|
// of this distribution and at http://opencv.org/license.html.
|
||||||
|
|
||||||
|
#include "convolution.hpp" |
||||||
|
#include "conv_winograd_f63.simd.hpp" |
||||||
|
#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" |
||||||
|
|
||||||
|
namespace cv { |
||||||
|
namespace dnn { |
||||||
|
|
||||||
|
// Returns the cv::dnn::Winofunc for the F32 Winograd kernels via CPU dispatching.
// NOTE(review): there is no explicit `return` here; CV_CPU_DISPATCH is presumably the
// OpenCV dispatch macro that expands to the `return` of the selected per-ISA variant
// declared in conv_winograd_f63.simd_declarations.hpp -- confirm against cv_cpu_dispatch.h.
cv::dnn::Winofunc getWinofunc_F32()
{
    CV_CPU_DISPATCH(getWinofunc_F32, (), CV_CPU_DISPATCH_MODES_ALL);
}
||||||
|
|
||||||
|
// Returns the cv::dnn::Winofunc for the F16 Winograd kernels via CPU dispatching.
// NOTE(review): there is no explicit `return` here; CV_CPU_DISPATCH is presumably the
// OpenCV dispatch macro that expands to the `return` of the selected per-ISA variant
// declared in conv_winograd_f63.simd_declarations.hpp -- confirm against cv_cpu_dispatch.h.
cv::dnn::Winofunc getWinofunc_F16()
{
    CV_CPU_DISPATCH(getWinofunc_F16, (), CV_CPU_DISPATCH_MODES_ALL);
}
||||||
|
|
||||||
|
}} // namespace cv::dnn::
|
@ -1,476 +0,0 @@ |
|||||||
// This file is part of OpenCV project.
|
|
||||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
|
||||||
// of this distribution and at http://opencv.org/license.html.
|
|
||||||
|
|
||||||
#include "../../precomp.hpp" |
|
||||||
#include "convolution.hpp" |
|
||||||
#include "opencv2/core/hal/intrin.hpp" |
|
||||||
|
|
||||||
namespace cv { |
|
||||||
namespace dnn { |
|
||||||
|
|
||||||
// NEON code workaround.
|
|
||||||
namespace opt_NEON |
|
||||||
{ |
|
||||||
|
|
||||||
#if CV_NEON && CV_NEON_AARCH64 |
|
||||||
|
|
||||||
/* Accumulate */ |
|
||||||
/* Multiply-accumulate step of the Winograd convolution (NEON, float32).

   For every atom (winoNatomF32 of them) the function sums, over Cg channels, the
   element-wise products of 4 weight vectors (w0..w3) with the input vectors of the
   current block and stores each accumulator to outbuf.

   inwptr       - transformed input; advanced by winoIblock*winoAtomF32 floats per channel
                  and never rewound, so atoms are read back-to-back.
   wptr         - transformed weights; advanced by winoKblock*winoAtomF32 floats per channel,
                  also never rewound.
   outbuf       - output accumulators; accumulator (k, i) is stored at
                  outbuf + (k*6 + i)*64 (+ winoAtomF32 per atom).
   Cg           - number of channels to accumulate over.
   iblock       - number of valid input vectors in the block: > 3 selects the full
                  4x6 accumulator path, otherwise only the first 3 inputs are processed.
   winoIblock, winoKblock, winoAtomF32 - must be 6, 4 and 4 (asserted below).
   winoNatomF32 - number of atoms to process.

   NOTE(review): the store stride is the literal 64, which presumably equals
   winoAtomF32*winoNatomF32 (i.e. winoNatomF32 == 16); only winoAtomF32 is asserted -- confirm. */
void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
{
    /* the unrolled code below hard-codes these block sizes */
    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
    if (iblock > 3)
    {
        /* full path: 4 weight vectors x 6 input vectors = 24 accumulators per atom */
        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
                outbuf += winoAtomF32)
        {
            /* sKI: accumulator for weight vector K and input vector I, all start at zero */
            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                                         wptr += winoKblock*winoAtomF32) {
                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
                float32x4_t x0, x1;
                /* inputs are consumed two vectors at a time: (0,1), (2,3), (4,5) */
                x0 = vld1q_f32(inwptr);
                x1 = vld1q_f32(inwptr + 4);
                s00 = vfmaq_f32(s00, w0, x0);
                s01 = vfmaq_f32(s01, w0, x1);
                s10 = vfmaq_f32(s10, w1, x0);
                s11 = vfmaq_f32(s11, w1, x1);
                s20 = vfmaq_f32(s20, w2, x0);
                s21 = vfmaq_f32(s21, w2, x1);
                s30 = vfmaq_f32(s30, w3, x0);
                s31 = vfmaq_f32(s31, w3, x1);
                x0 = vld1q_f32(inwptr + 8);
                x1 = vld1q_f32(inwptr + 12);
                s02 = vfmaq_f32(s02, w0, x0);
                s03 = vfmaq_f32(s03, w0, x1);
                s12 = vfmaq_f32(s12, w1, x0);
                s13 = vfmaq_f32(s13, w1, x1);
                s22 = vfmaq_f32(s22, w2, x0);
                s23 = vfmaq_f32(s23, w2, x1);
                s32 = vfmaq_f32(s32, w3, x0);
                s33 = vfmaq_f32(s33, w3, x1);
                x0 = vld1q_f32(inwptr + 16);
                x1 = vld1q_f32(inwptr + 20);
                s04 = vfmaq_f32(s04, w0, x0);
                s05 = vfmaq_f32(s05, w0, x1);
                s14 = vfmaq_f32(s14, w1, x0);
                s15 = vfmaq_f32(s15, w1, x1);
                s24 = vfmaq_f32(s24, w2, x0);
                s25 = vfmaq_f32(s25, w2, x1);
                s34 = vfmaq_f32(s34, w3, x0);
                s35 = vfmaq_f32(s35, w3, x1);
            }

            /* accumulator (k, i) goes to slot k*6 + i, slots are 64 floats apart */
            vst1q_f32(outbuf, s00);
            vst1q_f32(outbuf + 1*64, s01);
            vst1q_f32(outbuf + 2*64, s02);
            vst1q_f32(outbuf + 3*64, s03);
            vst1q_f32(outbuf + 4*64, s04);
            vst1q_f32(outbuf + 5*64, s05);

            vst1q_f32(outbuf + 6*64, s10);
            vst1q_f32(outbuf + 7*64, s11);
            vst1q_f32(outbuf + 8*64, s12);
            vst1q_f32(outbuf + 9*64, s13);
            vst1q_f32(outbuf + 10*64, s14);
            vst1q_f32(outbuf + 11*64, s15);

            vst1q_f32(outbuf + 12*64, s20);
            vst1q_f32(outbuf + 13*64, s21);
            vst1q_f32(outbuf + 14*64, s22);
            vst1q_f32(outbuf + 15*64, s23);
            vst1q_f32(outbuf + 16*64, s24);
            vst1q_f32(outbuf + 17*64, s25);

            vst1q_f32(outbuf + 18*64, s30);
            vst1q_f32(outbuf + 19*64, s31);
            vst1q_f32(outbuf + 20*64, s32);
            vst1q_f32(outbuf + 21*64, s33);
            vst1q_f32(outbuf + 22*64, s34);
            vst1q_f32(outbuf + 23*64, s35);
        }
    }
    else
    {
        /* short path (iblock <= 3): only the first 3 input vectors are used,
           but the pointers still advance by the full block size per channel */
        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
                outbuf += winoAtomF32)
        {
            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
                                         wptr += winoKblock*winoAtomF32) {
                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
                float32x4_t x0, x1, x2;
                x0 = vld1q_f32(inwptr);
                x1 = vld1q_f32(inwptr + 4);
                x2 = vld1q_f32(inwptr + 8);
                s00 = vfmaq_f32(s00, w0, x0);
                s01 = vfmaq_f32(s01, w0, x1);
                s02 = vfmaq_f32(s02, w0, x2);
                s10 = vfmaq_f32(s10, w1, x0);
                s11 = vfmaq_f32(s11, w1, x1);
                s12 = vfmaq_f32(s12, w1, x2);
                s20 = vfmaq_f32(s20, w2, x0);
                s21 = vfmaq_f32(s21, w2, x1);
                s22 = vfmaq_f32(s22, w2, x2);
                s30 = vfmaq_f32(s30, w3, x0);
                s31 = vfmaq_f32(s31, w3, x1);
                s32 = vfmaq_f32(s32, w3, x2);
            }

            /* same slot numbering as the full path (k*6 + i); slots 3..5 of each row are left untouched */
            vst1q_f32(outbuf, s00);
            vst1q_f32(outbuf + 1*64, s01);
            vst1q_f32(outbuf + 2*64, s02);
            vst1q_f32(outbuf + 6*64, s10);
            vst1q_f32(outbuf + 7*64, s11);
            vst1q_f32(outbuf + 8*64, s12);
            vst1q_f32(outbuf + 12*64, s20);
            vst1q_f32(outbuf + 13*64, s21);
            vst1q_f32(outbuf + 14*64, s22);
            vst1q_f32(outbuf + 18*64, s30);
            vst1q_f32(outbuf + 19*64, s31);
            vst1q_f32(outbuf + 20*64, s32);
        }
    }
}
|
||||||
|
|
||||||
/* In-place transpose of the 4x4 float matrix whose rows are the vectors a, b, c, d.
   tr0 and tr1 are caller-provided two-vector scratch variables (float32x4x2_t at the
   call sites) that are overwritten.
   The statements are wrapped in do { } while (0) so that the macro expands to a single
   statement and stays correct inside an unbraced if/else; invoke it as `T4x4(...);`. */
#undef T4x4
#define T4x4(a, b, c, d, tr0, tr1) \
    do { \
        tr0 = vtrnq_f32(a, b); \
        tr1 = vtrnq_f32(c, d); \
        a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
        b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
        c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
        d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])); \
    } while (0)
|
||||||
|
|
||||||
/*Input transform*/ |
|
||||||
/* Winograd input transform of one 8x8 float tile (NEON).

   Loads 8 rows of 8 floats (two float32x4_t per row), applies the 8-point transform
   given by the Y[..] coefficient comments to the rows, transposes the intermediate
   8x8 matrix, applies the same transform again (Z[..] comments) and stores the
   16 resulting vectors.

   inptr       - top-left element of the tile.
   inpstep     - distance between consecutive tile rows, in floats.
   outptr      - destination of the first result vector.
   Cg          - used only to compute the output stride.
   winoIblock,
   winoAtomF32 - used only to compute the output stride; consecutive result vectors are
                 written winoIblock*winoAtomF32*Cg floats apart. */
void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
    /* xR0 / xR1: left / right half (4 floats each) of tile row R */
    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);

    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;

    {
        /* ---- first pass: combine the tile rows x0..x7 into y0..y7 ---- */
        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
        float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
        t00 = vsubq_f32(x40, x20);
        t01 = vsubq_f32(x41, x21);
        t10 = vsubq_f32(x30, x50);
        t11 = vsubq_f32(x31, x51);
        float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25);
        float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25);
        float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
        float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);

        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
        /* t0x holds the odd-row terms, t1x the even-row terms; each pair of outputs is their sum and difference */
        float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
        t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
        t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
        t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25);
        t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25);

        float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
        float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);

        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
        float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
        float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
        /* x5 + x5 supplies the 2.f coefficient of row 5 */
        t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
        t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5);
        t10 = vfmaq_f32(x60, x20, q0_25);
        t11 = vfmaq_f32(x61, x21, q0_25);
        t00 = vfmaq_f32(t00, x30, qm2_5);
        t01 = vfmaq_f32(t01, x31, qm2_5);
        t10 = vfmaq_f32(t10, x40, qm1_25);
        t11 = vfmaq_f32(t11, x41, qm1_25);

        float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
        float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);

        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
        float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
        /* x1 + x1 supplies the 2.f coefficient of row 1 */
        t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
        t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
        t10 = vfmaq_f32(x60, x20, q4);
        t11 = vfmaq_f32(x61, x21, q4);
        t00 = vfmaq_f32(t00, x30, qm2_5);
        t01 = vfmaq_f32(t01, x31, qm2_5);
        t10 = vfmaq_f32(t10, x40, qm5);
        t11 = vfmaq_f32(t11, x41, qm5);

        float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
        float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        /* Y: */
        /*        y00 y01 */
        /*        y10 y11 */
        /*        ... */
        /*        y70 y71 */
        /*   Y': */
        /*        y00 y40 */
        /*        y10 y50 */
        /*        y20 y60 */
        /*        y30 y70 */
        /*        y01 y41 */
        /*        y11 y51 */
        /*        y21 y61 */
        /*        y31 y71 */
        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
        float32x4x2_t tr0, tr1;

        /* each 4x4 quadrant is transposed in place; the off-diagonal quadrants are not
           physically swapped, the second pass simply reads them under the swapped names */
        T4x4(y00, y10, y20, y30, tr0, tr1);
        T4x4(y01, y11, y21, y31, tr0, tr1);
        T4x4(y40, y50, y60, y70, tr0, tr1);
        T4x4(y41, y51, y61, y71, tr0, tr1);

        /* ---- second pass: same transform on the transposed matrix ----
           rows 0..7 of the left half are y00, y10, y20, y30, y01, y11, y21, y31;
           rows 0..7 of the right half are y40, y50, y60, y70, y41, y51, y61, y71 */
        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
        t00 = vsubq_f32(y01, y20);
        t01 = vsubq_f32(y41, y60);
        t10 = vsubq_f32(y30, y11);
        t11 = vsubq_f32(y70, y51);
        z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25);
        z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25);
        z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
        z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);

        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
        t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
        t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
        t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
        t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25);

        z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
        z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);

        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
        t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
        t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
        t10 = vfmaq_f32(y21, y20, q0_25);
        t11 = vfmaq_f32(y61, y60, q0_25);
        t00 = vfmaq_f32(t00, y30, qm2_5);
        t01 = vfmaq_f32(t01, y70, qm2_5);
        t10 = vfmaq_f32(t10, y01, qm1_25);
        t11 = vfmaq_f32(t11, y41, qm1_25);

        z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
        z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);

        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
        t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
        t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
        t10 = vfmaq_f32(y21, y20, q4);
        t11 = vfmaq_f32(y61, y60, q4);
        t00 = vfmaq_f32(t00, y30, qm2_5);
        t01 = vfmaq_f32(t01, y70, qm2_5);
        t10 = vfmaq_f32(t10, y01, qm5);
        t11 = vfmaq_f32(t11, y41, qm5);

        z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11);
        z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
    }

    /* distance between consecutive result vectors in the output buffer */
    const int outstep = winoIblock*winoAtomF32*Cg;

    /* results are written in the order z00, z01, z10, z11, ..., z70, z71 */
    vst1q_f32(outptr, z00);
    vst1q_f32(outptr + outstep, z01);
    vst1q_f32(outptr + outstep*2, z10);
    vst1q_f32(outptr + outstep*3, z11);
    vst1q_f32(outptr + outstep*4, z20);
    vst1q_f32(outptr + outstep*5, z21);
    vst1q_f32(outptr + outstep*6, z30);
    vst1q_f32(outptr + outstep*7, z31);
    vst1q_f32(outptr + outstep*8, z40);
    vst1q_f32(outptr + outstep*9, z41);
    vst1q_f32(outptr + outstep*10, z50);
    vst1q_f32(outptr + outstep*11, z51);
    vst1q_f32(outptr + outstep*12, z60);
    vst1q_f32(outptr + outstep*13, z61);
    vst1q_f32(outptr + outstep*14, z70);
    vst1q_f32(outptr + outstep*15, z71);
}
|
||||||
|
|
||||||
/*Output transform*/ |
|
||||||
/* Winograd output transform of one 8x8 float tile (NEON).

   Loads 8 rows of 8 floats, reduces them to 6 rows in a first pass, transposes,
   reduces again in a second pass, then adds the bias, optionally adds values read
   from bpptr, optionally clamps to [minval, maxval] and stores 6 rows of 6 floats.

   inptr       - top-left element of the 8x8 input tile.
   inpstep     - distance between consecutive input rows, in floats.
   bpptr       - optional (may be null) buffer whose 6x6 values are added to the result.
   bpstep      - distance between consecutive bpptr rows, in floats.
   outptr      - destination of the 6x6 result.
   outstep     - distance between consecutive output rows, in floats.
   bias        - scalar added to every result element.
   minval,
   maxval      - clamp bounds, used only when ifMinMaxAct is true.
   ifMinMaxAct - enables the clamp. */
void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct)
{
    /* xR0 / xR1: left / right half (4 floats each) of tile row R */
    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;

    {
        /* ---- first pass: 8 input rows -> 6 rows ---- */
        /* pairwise sums of rows (1,2), (3,4), (5,6) feed the even outputs y0, y2, y4 */
        float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
        s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21);
        s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41);
        s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61);

        /* y0 = x0 + s12 + s34 + s56;  y2 = s12 + 4*s34 + s56/4;  y4 = s12 + 16*s34 + s56/16 */
        float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0);
        float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1);
        float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
        float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
        float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
        float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);

        /* the same temporaries now hold pairwise differences, feeding the odd outputs y1, y3, y5 */
        s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21);
        s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41);
        s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);

        /* y5 = x7 + d12 + 32*d34 + d56/32;  y1 = d12 + 2*d34 + d56/2;  y3 = d12 + 8*d34 + d56/8 */
        float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
                                                  s34_0, 32.f), s56_0, 1.f/32);
        float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
                                                  s34_1, 32.f), s56_1, 1.f/32);
        float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
        float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
        float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
        float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
        /* rows 6 and 7 do not exist after the first pass; zero-fill them so the 8x8 transpose can be reused */
        float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;

        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
        /*  Y: */
        /*        y00 y01 */
        /*        y10 y11 */
        /*        ... */
        /*        y50 y51 */
        /*        0   0 */
        /*        0   0 */
        /*   Y': */
        /*        y00 y40 */
        /*        y10 y50 */
        /*        y20 y60 */
        /*        y30 y70 */
        /*        y01 y41 */
        /*        y11 y51 */
        /*        y21 y61 */
        /*        y31 y71 */
        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
        float32x4x2_t tr0, tr1;

        /* each 4x4 quadrant is transposed in place; the second pass reads the
           off-diagonal quadrants under the swapped names listed above */
        T4x4(y00, y10, y20, y30, tr0, tr1);
        T4x4(y01, y11, y21, y31, tr0, tr1);
        T4x4(y40, y50, y60, y70, tr0, tr1);
        T4x4(y41, y51, y61, y71, tr0, tr1);

        /* ---- second pass: same reduction on the transposed matrix ----
           rows 0..7 of the left half are y00, y10, y20, y30, y01, y11, y21, y31;
           rows 0..7 of the right half are y40, y50, y60, y70, y41, y51, y61, y71 */
        s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60);
        s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41);
        s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61);

        z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0);
        z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1);
        z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
        z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
        z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
        z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);

        s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60);
        s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41);
        s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);

        /* y31 / y71 play the role of row 7 here */
        z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
                                      s34_0, 32.f), s56_0, 1.f/32);
        z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
                                      s34_1, 32.f), s56_1, 1.f/32);
        z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
        z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
        z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
        z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
        float32x4_t vbias = vdupq_n_f32(bias);

        /* the bias is added to every lane of every result vector */
        z00 = vaddq_f32(z00, vbias);
        z01 = vaddq_f32(z01, vbias);
        z10 = vaddq_f32(z10, vbias);
        z11 = vaddq_f32(z11, vbias);
        z20 = vaddq_f32(z20, vbias);
        z21 = vaddq_f32(z21, vbias);
        z30 = vaddq_f32(z30, vbias);
        z31 = vaddq_f32(z31, vbias);
        z40 = vaddq_f32(z40, vbias);
        z41 = vaddq_f32(z41, vbias);
        z50 = vaddq_f32(z50, vbias);
        z51 = vaddq_f32(z51, vbias);
    }

    if (bpptr)
    {
        /* add the 6x6 values from bpptr: 4 floats via a full load plus 2 floats via a
           half load whose upper half is padded with zeros */
        float32x2_t zhalf = vdup_n_f32(0.f);
        z00 = vaddq_f32(z00, vld1q_f32(bpptr));
        z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf));
        z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep));
        z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf));
        z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2));
        z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf));
        z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3));
        z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf));
        z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4));
        z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf));
        z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5));
        z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf));
    }

    if (ifMinMaxAct)
    {
        /* clamp every result to [minval, maxval] */
        float32x4_t vmax = vdupq_n_f32(maxval);
        float32x4_t vmin = vdupq_n_f32(minval);

        z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax);
        z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax);
        z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax);
        z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax);
        z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax);
        z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax);
        z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax);
        z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax);
        z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax);
        z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax);
        z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax);
        z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax);
    }

    /* each output row gets 6 floats: a full 4-float store plus the low 2 floats of the second vector */
    vst1q_f32(outptr, z00);
    vst1_f32(outptr + 4, vget_low_f32(z01));
    vst1q_f32(outptr + outstep, z10);
    vst1_f32(outptr + outstep + 4, vget_low_f32(z11));
    vst1q_f32(outptr + outstep*2, z20);
    vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21));
    vst1q_f32(outptr + outstep*3, z30);
    vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31));
    vst1q_f32(outptr + outstep*4, z40);
    vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
    vst1q_f32(outptr + outstep*5, z50);
    vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
}
|
||||||
|
|
||||||
#endif |
|
||||||
} |
|
||||||
|
|
||||||
}} // namespace
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue