lavc/h264dsp: unroll R-V V weight16

Since there is no VLSE128.V instruction, unrolling the loop is the only way left to hide latency.

T-Head C908:
h264_weight16_8_c:                                     989.4 ( 1.00x)
h264_weight16_8_rvv_i32:                               193.2 ( 5.12x)

SpacemiT X60:
h264_weight16_8_c:                                     874.1 ( 1.00x)
h264_weight16_8_rvv_i32:                               196.9 ( 4.44x)
pull/153/merge
Rémi Denis-Courmont 5 months ago
parent 4936bb2508
commit 459a1512f1
  1. libavcodec/riscv/h264dsp_init.c | 2
  2. libavcodec/riscv/h264dsp_rvv.S | 21

@@ -97,7 +97,7 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
const bool zvl128b = ff_rv_vlen_least(128); const bool zvl128b = ff_rv_vlen_least(128);
if (bit_depth == 8) { if (bit_depth == 8) {
if (zvl128b) if (zvl128b && (flags & AV_CPU_FLAG_RVB))
dsp->weight_h264_pixels_tab[0] = dsp->weight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].weight; ff_h264_weight_funcs_8_rvv[0].weight;
if (flags & AV_CPU_FLAG_RVV_I64) if (flags & AV_CPU_FLAG_RVV_I64)

@@ -29,20 +29,29 @@
#include "libavutil/riscv/asm.S" #include "libavutil/riscv/asm.S"
.variant_cc ff_h264_weight_pixels_simple_8_rvv .variant_cc ff_h264_weight_pixels_simple_8_rvv
func ff_h264_weight_pixels_simple_8_rvv, zve32x func ff_h264_weight_pixels_simple_8_rvv, zve32x, b
csrwi vxrm, 0 csrwi vxrm, 0
sll a5, a5, a3 sll a5, a5, a3
1: 1:
vsetvli zero, t6, e16, m2, ta, ma vsetvli zero, t6, e16, m2, ta, ma
vle8.v v8, (a0) add t0, a0, a1
addi a2, a2, -1 vle8.v v8, (a0)
addi a2, a2, -2
vle8.v v9, (t0)
vzext.vf2 v24, v8 vzext.vf2 v24, v8
vzext.vf2 v26, v9
vmul.vx v16, v24, a4 vmul.vx v16, v24, a4
vmul.vx v18, v26, a4
vsadd.vx v16, v16, a5 vsadd.vx v16, v16, a5
vmax.vx v16, v16, zero vsadd.vx v18, v18, a5
vsetvli zero, zero, e8, m1, ta, ma vmax.vx v16, v16, zero
vmax.vx v18, v18, zero
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wx v8, v16, a3 vnclipu.wx v8, v16, a3
vse8.v v8, (a0) vnclipu.wx v9, v18, a3
vse8.v v8, (a0)
vse8.v v9, (t0)
sh1add a0, a1, a0
add a0, a0, a1 add a0, a0, a1
bnez a2, 1b bnez a2, 1b

Loading…
Cancel
Save