diff --git a/libavcodec/riscv/sbrdsp_init.c b/libavcodec/riscv/sbrdsp_init.c
index 71de681185..c1ed5b639c 100644
--- a/libavcodec/riscv/sbrdsp_init.c
+++ b/libavcodec/riscv/sbrdsp_init.c
@@ -26,6 +26,7 @@
 void ff_sbr_sum64x5_rvv(float *z);
 float ff_sbr_sum_square_rvv(float (*x)[2], int n);
 void ff_sbr_neg_odd_64_rvv(float *x);
+void ff_sbr_autocorrelate_rvv(const float x[40][2], float phi[3][2][2]);
 void ff_sbr_hf_g_filt_rvv(float (*Y)[2], const float (*X_high)[40][2],
                           const float *g_filt, int m_max, intptr_t ixh);
 
@@ -34,10 +35,13 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c)
 #if HAVE_RVV
     int flags = av_get_cpu_flags();
 
-    if ((flags & AV_CPU_FLAG_RVV_F32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
-        c->sum64x5 = ff_sbr_sum64x5_rvv;
-        c->sum_square = ff_sbr_sum_square_rvv;
-        c->hf_g_filt = ff_sbr_hf_g_filt_rvv;
+    if (flags & AV_CPU_FLAG_RVV_F32) {
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            c->sum64x5 = ff_sbr_sum64x5_rvv;
+            c->sum_square = ff_sbr_sum_square_rvv;
+            c->hf_g_filt = ff_sbr_hf_g_filt_rvv;
+        }
+        c->autocorrelate = ff_sbr_autocorrelate_rvv;
     }
 #if __riscv_xlen >= 64
     if ((flags & AV_CPU_FLAG_RVV_I64) && (flags & AV_CPU_FLAG_RVB_ADDR))
diff --git a/libavcodec/riscv/sbrdsp_rvv.S b/libavcodec/riscv/sbrdsp_rvv.S
index 932a5dd7d1..2f3a0969d7 100644
--- a/libavcodec/riscv/sbrdsp_rvv.S
+++ b/libavcodec/riscv/sbrdsp_rvv.S
@@ -85,6 +85,95 @@ func ff_sbr_neg_odd_64_rvv, zve64x
 endfunc
 #endif
 
+func ff_sbr_autocorrelate_rvv, zve32f
+        vsetvli      t0, zero, e32, m4, ta, ma
+        vmv.v.x      v0, zero
+        flw          fa0, (a0)
+        vmv.v.x      v4, zero
+        flw          fa1, 4(a0)
+        vmv.v.x      v8, zero
+        flw          fa2, 8(a0)
+        li           a2, 37
+        flw          fa3, 12(a0)
+        fmul.s       ft10, fa0, fa0
+        flw          fa4, 16(a0)
+        fmul.s       ft6, fa0, fa2
+        flw          fa5, 20(a0)
+        addi         a0, a0, 38 * 8
+        fmul.s       ft7, fa0, fa3
+        fmul.s       ft2, fa0, fa4
+        fmul.s       ft3, fa0, fa5
+        flw          fa0, (a0)
+        fmadd.s      ft10, fa1, fa1, ft10
+        fmadd.s      ft6, fa1, fa3, ft6
+        flw          fa3, 12(a0)
+        fnmsub.s     ft7, fa1, fa2, ft7
+        flw          fa2, 8(a0)
+        fmadd.s      ft2, fa1, fa5, ft2
+        fnmsub.s     ft3, fa1, fa4, ft3
+        flw          fa1, 4(a0)
+        fmul.s       ft4, fa0, fa0
+        fmul.s       ft0, fa0, fa2
+        fmul.s       ft1, fa0, fa3
+        fmadd.s      ft4, fa1, fa1, ft4
+        fmadd.s      ft0, fa1, fa3, ft0
+        fnmsub.s     ft1, fa1, fa2, ft1
+1:
+        vsetvli      t0, a2, e32, m2, tu, ma
+        slli         t1, t0, 3
+        sub          a0, a0, t1
+        vlseg2e32.v  v16, (a0)
+        sub          a2, a2, t0
+        vfmacc.vv    v0, v16, v16
+        vfslide1down.vf v20, v16, fa0
+        vfmacc.vv    v4, v16, v20
+        vfslide1down.vf v22, v18, fa1
+        vfmacc.vv    v0, v18, v18
+        vfslide1down.vf v24, v20, fa2
+        vfmacc.vv    v4, v18, v22
+        vfslide1down.vf v26, v22, fa3
+        vfmacc.vv    v6, v16, v22
+        vfmv.f.s     fa0, v16
+        vfmacc.vv    v8, v16, v24
+        vfmv.f.s     fa1, v18
+        vfmacc.vv    v10, v16, v26
+        vfmv.f.s     fa2, v20
+        vfnmsac.vv   v6, v18, v20
+        vfmv.f.s     fa3, v22
+        vfmacc.vv    v8, v18, v26
+        vfnmsac.vv   v10, v18, v24
+        bnez         a2, 1b
+
+        vsetvli      t0, zero, e32, m2, ta, ma
+        vfredusum.vs v0, v0, v2
+        vfredusum.vs v4, v4, v2
+        vfmv.f.s     fa0, v0
+        vfredusum.vs v6, v6, v2
+        vfmv.f.s     fa2, v4
+        fadd.s       ft4, ft4, fa0
+        vfredusum.vs v8, v8, v2
+        vfmv.f.s     fa3, v6
+        fadd.s       ft0, ft0, fa2
+        vfredusum.vs v10, v10, v2
+        vfmv.f.s     fa4, v8
+        fadd.s       ft1, ft1, fa3
+        vfmv.f.s     fa5, v10
+        fsw          ft0, (a1)
+        fadd.s       ft2, ft2, fa4
+        fsw          ft1, 4(a1)
+        fadd.s       ft3, ft3, fa5
+        fsw          ft2, 8(a1)
+        fadd.s       ft6, ft6, fa2
+        fsw          ft3, 12(a1)
+        fadd.s       ft7, ft7, fa3
+        fsw          ft4, 16(a1)
+        fadd.s       ft10, ft10, fa0
+        fsw          ft6, 24(a1)
+        fsw          ft7, 28(a1)
+        fsw          ft10, 40(a1)
+        ret
+endfunc
+
 func ff_sbr_hf_g_filt_rvv, zve32f
         li      t1, 40 * 2 * 4
         sh3add  a1, a4, a1
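
For review convenience, below is a scalar C sketch of what the new ff_sbr_autocorrelate_rvv routine computes, reconstructed from the loads, vector accumulators and store offsets in the patch above. The helper name sbr_autocorrelate_sketch is made up for illustration and this is not claimed to be FFmpeg's C reference implementation verbatim: the vector loop covers x[1..37] for the lag-0/1/2 complex autocorrelations, and the scalar prologue/epilogue folds in the boundary terms for x[0], x[38] and x[39].

/*
 * Scalar sketch of ff_sbr_autocorrelate_rvv, reconstructed from the
 * register usage and store offsets in the patch above.  Hypothetical
 * helper, not FFmpeg's reference implementation verbatim.
 */
void sbr_autocorrelate_sketch(const float x[40][2], float phi[3][2][2])
{
    float r0 = 0.f;           /* lag-0 sum over x[1..37] (v0/v1)       */
    float r1 = 0.f, i1 = 0.f; /* lag-1 sums over x[1..37] (v4/v5, v6/v7) */
    float r2 = 0.f, i2 = 0.f; /* lag-2 sums over x[1..37] (v8/v9, v10/v11) */

    for (int i = 1; i < 38; i++) {
        r0 += x[i][0] * x[i][0]     + x[i][1] * x[i][1];
        r1 += x[i][0] * x[i + 1][0] + x[i][1] * x[i + 1][1];
        i1 += x[i][0] * x[i + 1][1] - x[i][1] * x[i + 1][0];
        r2 += x[i][0] * x[i + 2][0] + x[i][1] * x[i + 2][1];
        i2 += x[i][0] * x[i + 2][1] - x[i][1] * x[i + 2][0];
    }

    /* Boundary terms from the scalar code, matched to the eight stores
     * at byte offsets 0, 4, 8, 12, 16, 24, 28 and 40 from a1. */
    phi[0][0][0] = r1 + x[38][0] * x[39][0] + x[38][1] * x[39][1]; /*  0(a1) */
    phi[0][0][1] = i1 + x[38][0] * x[39][1] - x[38][1] * x[39][0]; /*  4(a1) */
    phi[0][1][0] = r2 + x[0][0] * x[2][0] + x[0][1] * x[2][1];     /*  8(a1) */
    phi[0][1][1] = i2 + x[0][0] * x[2][1] - x[0][1] * x[2][0];     /* 12(a1) */
    phi[1][0][0] = r0 + x[38][0] * x[38][0] + x[38][1] * x[38][1]; /* 16(a1) */
    phi[1][1][0] = r1 + x[0][0] * x[1][0] + x[0][1] * x[1][1];     /* 24(a1) */
    phi[1][1][1] = i1 + x[0][0] * x[1][1] - x[0][1] * x[1][0];     /* 28(a1) */
    phi[2][1][0] = r0 + x[0][0] * x[0][0] + x[0][1] * x[0][1];     /* 40(a1) */
}

The remaining entries of phi[3][2][2] (byte offsets 20, 32, 36 and 44) are not written by the assembly, so the sketch leaves them untouched as well.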