lavc/aacpsdsp: rework R-V V add_squares

Segmented loads may be slower than unit-strided loads. So this instead uses
a unit-strided 64-bit load followed by two narrowing shifts (by 0 and by 32
bits) to split each element into its two 32-bit halves.

Before:
ps_add_squares_c: 60757.7
ps_add_squares_rvv_f32: 22242.5

After:
ps_add_squares_c: 60516.0
ps_add_squares_rvv_i64: 17067.7
release/7.0
Rémi Denis-Courmont 2 years ago
parent ab78d22553
commit eb508702a8
  1. 3
      libavcodec/riscv/aacpsdsp_init.c
  2. 9
      libavcodec/riscv/aacpsdsp_rvv.S

@@ -46,7 +46,8 @@ av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
if (flags & AV_CPU_FLAG_RVB_ADDR) {
c->add_squares = ff_ps_add_squares_rvv;
if (flags & AV_CPU_FLAG_RVV_I64)
c->add_squares = ff_ps_add_squares_rvv;
c->mul_pair_single = ff_ps_mul_pair_single_rvv;
c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;
}

@@ -1,5 +1,5 @@
/*
* Copyright © 2022 Rémi Denis-Courmont.
* Copyright © 2022-2023 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
*
@@ -20,13 +20,16 @@
#include "libavutil/riscv/asm.S"
func ff_ps_add_squares_rvv, zve32f
func ff_ps_add_squares_rvv, zve64f
li t1, 32
1:
vsetvli t0, a2, e32, m4, ta, ma
vlseg2e32.v v24, (a1)
vle64.v v8, (a1)
sub a2, a2, t0
vnsrl.wx v24, v8, zero
vle32.v v16, (a0)
sh3add a1, t0, a1
vnsrl.wx v28, v8, t1
vfmacc.vv v16, v24, v24
vfmacc.vv v16, v28, v28
vse32.v v16, (a0)

Loading…
Cancel
Save