lavc/h264dsp: optimise R-V V weight for shorter heights

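The height is a power of two, at most 16 rows. The current code was
optimised for large sample counts, so it performs poorly on such small
blocks (in several cases slower than the C reference, as shown below).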

T-Head C908:
h264_weight2_8_c:                                      211.7 ( 1.00x)
h264_weight2_8_rvv_i32:                   before       184.0 ( 1.15x)
h264_weight2_8_rvv_i32:                   after         54.2 ( 3.90x)
h264_weight4_8_c:                                      285.7 ( 1.00x)
h264_weight4_8_rvv_i32:                   before       341.2 ( 0.86x)
h264_weight4_8_rvv_i32:                   after         82.2 ( 3.47x)
h264_weight8_8_c:                                      498.7 ( 1.00x)
h264_weight8_8_rvv_i32:                   before       683.7 ( 0.73x)
h264_weight8_8_rvv_i64:                   after        128.5 ( 3.95x)
h264_weight16_8_c:                                     878.2 ( 1.00x)
h264_weight16_8_rvv_i32:                  unchanged    239.5 ( 3.67x)

SpacemiT X60:
h264_weight2_8_c:                                      207.2 ( 1.00x)
h264_weight2_8_rvv_i32:                   before       259.6 ( 0.80x)
h264_weight2_8_rvv_i32:                   after         82.2 ( 2.52x)
h264_weight4_8_c:                                      290.8 ( 1.00x)
h264_weight4_8_rvv_i32:                   before       509.6 ( 0.57x)
h264_weight4_8_rvv_i32:                   after         61.5 ( 4.73x)
h264_weight8_8_c:                                      498.8 ( 1.00x)
h264_weight8_8_rvv_i32:                   before      1019.8 ( 0.49x)
h264_weight8_8_rvv_i64:                   after         71.8 ( 6.95x)
h264_weight16_8_c:                                     874.0 ( 1.00x)
h264_weight16_8_rvv_i32:                  unchanged    249.0 ( 3.51x)
pull/153/merge
Rémi Denis-Courmont 7 months ago
parent ba7d0d5fc3
commit 4936bb2508
  1. 18
      libavcodec/riscv/h264dsp_init.c
  2. 62
      libavcodec/riscv/h264dsp_rvv.S

@ -96,13 +96,23 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
if (flags & AV_CPU_FLAG_RVV_I32) { if (flags & AV_CPU_FLAG_RVV_I32) {
const bool zvl128b = ff_rv_vlen_least(128); const bool zvl128b = ff_rv_vlen_least(128);
if (bit_depth == 8) {
if (zvl128b)
dsp->weight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].weight;
if (flags & AV_CPU_FLAG_RVV_I64)
dsp->weight_h264_pixels_tab[1] =
ff_h264_weight_funcs_8_rvv[1].weight;
dsp->weight_h264_pixels_tab[2] =
ff_h264_weight_funcs_8_rvv[2].weight;
dsp->weight_h264_pixels_tab[3] =
ff_h264_weight_funcs_8_rvv[3].weight;
}
if (bit_depth == 8 && zvl128b) { if (bit_depth == 8 && zvl128b) {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++)
dsp->weight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].weight;
dsp->biweight_h264_pixels_tab[i] = dsp->biweight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].biweight; ff_h264_weight_funcs_8_rvv[i].biweight;
}
dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv; dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv; dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;

@ -28,11 +28,12 @@
#include "libavutil/riscv/asm.S" #include "libavutil/riscv/asm.S"
.variant_cc ff_h264_weight_pixels_simple_8_rvv
func ff_h264_weight_pixels_simple_8_rvv, zve32x func ff_h264_weight_pixels_simple_8_rvv, zve32x
csrwi vxrm, 0 csrwi vxrm, 0
sll a5, a5, a3 sll a5, a5, a3
1: 1:
vsetvli zero, a6, e16, m2, ta, ma vsetvli zero, t6, e16, m2, ta, ma
vle8.v v8, (a0) vle8.v v8, (a0)
addi a2, a2, -1 addi a2, a2, -1
vzext.vf2 v24, v8 vzext.vf2 v24, v8
@ -76,38 +77,36 @@ func ff_h264_biweight_pixels_simple_8_rvv, zve32x
ret ret
endfunc endfunc
func ff_h264_weight_pixels_8_rvv, zve32x .macro h264_weight depth, w, b=
func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x
lpad 0
.ifb \b
li t6, \w
j ff_h264_weight_pixels_simple_\depth\()_rvv
.else
csrwi vxrm, 0 csrwi vxrm, 0
sll a5, a5, a3 sll a5, a5, a3
1: 1:
mv t0, a0 vsetvli t1, a2, e\b, m2, ta, ma
mv t6, a6 vlse\b\().v v8, (a0), a1
2: vsetvli t0, zero, e16, m4, ta, ma
vsetvli t2, a2, e16, m8, ta, ma vzext.vf2 v24, v8
vlsseg2e8.v v0, (t0), a1 sub a2, a2, t1
addi t6, t6, -2 vmul.vx v16, v24, a4
vzext.vf2 v16, v0 mul t2, t1, a1
vzext.vf2 v24, v4
vmul.vx v16, v16, a4
vmul.vx v24, v24, a4
vsadd.vx v16, v16, a5 vsadd.vx v16, v16, a5
vsadd.vx v24, v24, a5
vmax.vx v16, v16, zero vmax.vx v16, v16, zero
vmax.vx v24, v24, zero vsetvli zero, zero, e8, m2, ta, ma
vsetvli zero, zero, e8, m4, ta, ma vnclipu.wx v8, v16, a3
vnclipu.wx v0, v16, a3 vsetvli zero, t1, e\b, m2, ta, ma
vnclipu.wx v4, v24, a3 vsse\b\().v v8, (a0), a1
vssseg2e8.v v0, (t0), a1 add a0, a0, t2
addi t0, t0, 2
bnez t6, 2b
mul t3, a1, t2
sub a2, a2, t2
add a0, a0, t3
bnez a2, 1b bnez a2, 1b
ret ret
.endif
endfunc endfunc
.endm
.variant_cc ff_h264_biweight_pixels_8_rvv .variant_cc ff_h264_biweight_pixels_8_rvv
func ff_h264_biweight_pixels_8_rvv, zve32x func ff_h264_biweight_pixels_8_rvv, zve32x
@ -152,17 +151,12 @@ func ff_h264_biweight_pixels_8_rvv, zve32x
ret ret
endfunc endfunc
.irp w, 16, 8, 4, 2 h264_weight 8, 2, 16
func ff_h264_weight_pixels\w\()_8_rvv, zve32x h264_weight 8, 4, 32
lpad 0 h264_weight 8, 8, 64
li a6, \w h264_weight 8, 16
.if \w == 16
j ff_h264_weight_pixels_simple_8_rvv
.else
j ff_h264_weight_pixels_8_rvv
.endif
endfunc
.irp w, 16, 8, 4, 2
func ff_h264_biweight_pixels\w\()_8_rvv, zve32x func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
lpad 0 lpad 0
li t6, \w li t6, \w

Loading…
Cancel
Save