lavc/aacpsdsp: unroll R-V V stereo interpolate

Branch: pull/390/head
Author: Rémi Denis-Courmont
commit c270928cc0, parent 27d74fc1ef

1 changed file with 23 additions and 23 deletions:
  libavcodec/riscv/aacpsdsp_rvv.S
@@ -223,7 +223,7 @@ func ff_ps_hybrid_synthesis_deint_rvv, zve32x
 endfunc
 
 func ff_ps_stereo_interpolate_rvv, zve32f
-        vsetvli     t0, zero, e32, m1, ta, ma
+        vsetvli     t0, zero, e32, m2, ta, ma
         vid.v       v24
         flw         ft0, (a2)
         vadd.vi     v24, v24, 1        // v24[i] = i + 1
@@ -232,43 +232,43 @@ func ff_ps_stereo_interpolate_rvv, zve32f
         flw         ft2, 8(a2)
         vfmv.v.f    v16, ft0
         flw         ft3, 12(a2)
-        vfmv.v.f    v17, ft1
+        vfmv.v.f    v18, ft1
         flw         ft0, (a3)
-        vfmv.v.f    v18, ft2
+        vfmv.v.f    v20, ft2
         flw         ft1, 4(a3)
-        vfmv.v.f    v19, ft3
+        vfmv.v.f    v22, ft3
         flw         ft2, 8(a3)
         flw         ft3, 12(a3)
         fcvt.s.wu   ft4, t0            // (float)(vlenb / sizeof (float))
         vfmacc.vf   v16, ft0, v24      // h0 += (i + 1) * h0_step
         fmul.s      ft0, ft0, ft4
-        vfmacc.vf   v17, ft1, v24
+        vfmacc.vf   v18, ft1, v24
         fmul.s      ft1, ft1, ft4
-        vfmacc.vf   v18, ft2, v24
+        vfmacc.vf   v20, ft2, v24
         fmul.s      ft2, ft2, ft4
-        vfmacc.vf   v19, ft3, v24
+        vfmacc.vf   v22, ft3, v24
         fmul.s      ft3, ft3, ft4
1:
-        vsetvli     t0, a4, e32, m1, ta, ma
-        vlseg2e32.v v8, (a0)           // v8:l_re, v9:l_im
+        vsetvli     t0, a4, e32, m2, ta, ma
+        vlseg2e32.v v0, (a0)           // v0:l_re, v2:l_im
         sub         a4, a4, t0
-        vlseg2e32.v v10, (a1)          // v10:r_re, v11:r_im
-        vfmul.vv    v12, v8, v16
-        vfmul.vv    v13, v9, v16
-        vfmul.vv    v14, v8, v17
-        vfmul.vv    v15, v9, v17
-        vfmacc.vv   v12, v10, v18
-        vfmacc.vv   v13, v11, v18
-        vfmacc.vv   v14, v10, v19
-        vfmacc.vv   v15, v11, v19
-        vsseg2e32.v v12, (a0)
+        vlseg2e32.v v4, (a1)           // v4:r_re, v6:r_im
+        vfmul.vv    v8, v0, v16
+        vfmul.vv    v10, v2, v16
+        vfmul.vv    v12, v0, v18
+        vfmul.vv    v14, v2, v18
+        vfmacc.vv   v8, v4, v20
+        vfmacc.vv   v10, v6, v20
+        vfmacc.vv   v12, v4, v22
+        vfmacc.vv   v14, v6, v22
+        vsseg2e32.v v8, (a0)
         sh3add      a0, t0, a0
-        vsseg2e32.v v14, (a1)
+        vsseg2e32.v v12, (a1)
         sh3add      a1, t0, a1
         vfadd.vf    v16, v16, ft0      // h0 += (vlenb / sizeof (float)) * h0_step
-        vfadd.vf    v17, v17, ft1
-        vfadd.vf    v18, v18, ft2
-        vfadd.vf    v19, v19, ft3
+        vfadd.vf    v18, v18, ft1
+        vfadd.vf    v20, v20, ft2
+        vfadd.vf    v22, v22, ft3
         bnez        a4, 1b
         ret
 endfunc
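The unroll itself is the vsetvli change from m1 to m2: with LMUL=2 each vector operand names a group of two registers, so every loop iteration covers twice as many stereo samples, and vector operands must be even-numbered, which is why the registers were renumbered (l_re/l_im move from v8/v9 to v0/v2, the broadcast coefficients from v16..v19 to v16/v18/v20/v22, and so on). A minimal arithmetic sketch of the effect, using a hypothetical helper name:

/* Illustration only (hypothetical helper, not FFmpeg code): e32 elements
 * covered by one vector operand, i.e. stereo samples processed per loop
 * iteration in the routine above. */
static unsigned e32_elems_per_iteration(unsigned vlen_bits, unsigned lmul)
{
    return vlen_bits / 32 * lmul;
}
/* e.g. VLEN = 128: m1 -> 4 samples per iteration, m2 -> 8. */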
