lavc/h264dsp: optimise R-V V weight for shorter heights

The height is always a power of two and never exceeds 16 rows, but the
current code was optimised for large sample counts.

T-Head C908:
h264_weight2_8_c:                                      211.7 ( 1.00x)
h264_weight2_8_rvv_i32:                   before       184.0 ( 1.15x)
h264_weight2_8_rvv_i32:                   after         54.2 ( 3.90x)
h264_weight4_8_c:                                      285.7 ( 1.00x)
h264_weight4_8_rvv_i32:                   before       341.2 ( 0.86x)
h264_weight4_8_rvv_i32:                   after         82.2 ( 3.47x)
h264_weight8_8_c:                                      498.7 ( 1.00x)
h264_weight8_8_rvv_i32:                   before       683.7 ( 0.73x)
h264_weight8_8_rvv_i64:                   after        128.5 ( 3.95x)
h264_weight16_8_c:                                     878.2 ( 1.00x)
h264_weight16_8_rvv_i32:                  unchanged    239.5 ( 3.67x)

SpacemiT X60:
h264_weight2_8_c:                                      207.2 ( 1.00x)
h264_weight2_8_rvv_i32:                   before       259.6 ( 0.80x)
h264_weight2_8_rvv_i32:                   after         82.2 ( 2.52x)
h264_weight4_8_c:                                      290.8 ( 1.00x)
h264_weight4_8_rvv_i32:                   before       509.6 ( 0.57x)
h264_weight4_8_rvv_i32:                   after         61.5 ( 4.73x)
h264_weight8_8_c:                                      498.8 ( 1.00x)
h264_weight8_8_rvv_i32:                   before      1019.8 ( 0.49x)
h264_weight8_8_rvv_i64:                   after         71.8 ( 6.95x)
h264_weight16_8_c:                                     874.0 ( 1.00x)
h264_weight16_8_rvv_i32:                  unchanged    249.0 ( 3.51x)
pull/153/merge
Rémi Denis-Courmont 7 months ago
parent ba7d0d5fc3
commit 4936bb2508
  1. 18
      libavcodec/riscv/h264dsp_init.c
  2. 62
      libavcodec/riscv/h264dsp_rvv.S

@ -96,13 +96,23 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
if (flags & AV_CPU_FLAG_RVV_I32) {
const bool zvl128b = ff_rv_vlen_least(128);
if (bit_depth == 8) {
if (zvl128b)
dsp->weight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].weight;
if (flags & AV_CPU_FLAG_RVV_I64)
dsp->weight_h264_pixels_tab[1] =
ff_h264_weight_funcs_8_rvv[1].weight;
dsp->weight_h264_pixels_tab[2] =
ff_h264_weight_funcs_8_rvv[2].weight;
dsp->weight_h264_pixels_tab[3] =
ff_h264_weight_funcs_8_rvv[3].weight;
}
if (bit_depth == 8 && zvl128b) {
for (int i = 0; i < 4; i++) {
dsp->weight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].weight;
for (int i = 0; i < 4; i++)
dsp->biweight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].biweight;
}
dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;

@ -28,11 +28,12 @@
#include "libavutil/riscv/asm.S"
.variant_cc ff_h264_weight_pixels_simple_8_rvv
func ff_h264_weight_pixels_simple_8_rvv, zve32x
csrwi vxrm, 0
sll a5, a5, a3
1:
vsetvli zero, a6, e16, m2, ta, ma
vsetvli zero, t6, e16, m2, ta, ma
vle8.v v8, (a0)
addi a2, a2, -1
vzext.vf2 v24, v8
@ -76,38 +77,36 @@ func ff_h264_biweight_pixels_simple_8_rvv, zve32x
ret
endfunc
func ff_h264_weight_pixels_8_rvv, zve32x
.macro h264_weight depth, w, b=
func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x
lpad 0
.ifb \b
li t6, \w
j ff_h264_weight_pixels_simple_\depth\()_rvv
.else
csrwi vxrm, 0
sll a5, a5, a3
1:
mv t0, a0
mv t6, a6
2:
vsetvli t2, a2, e16, m8, ta, ma
vlsseg2e8.v v0, (t0), a1
addi t6, t6, -2
vzext.vf2 v16, v0
vzext.vf2 v24, v4
vmul.vx v16, v16, a4
vmul.vx v24, v24, a4
vsetvli t1, a2, e\b, m2, ta, ma
vlse\b\().v v8, (a0), a1
vsetvli t0, zero, e16, m4, ta, ma
vzext.vf2 v24, v8
sub a2, a2, t1
vmul.vx v16, v24, a4
mul t2, t1, a1
vsadd.vx v16, v16, a5
vsadd.vx v24, v24, a5
vmax.vx v16, v16, zero
vmax.vx v24, v24, zero
vsetvli zero, zero, e8, m4, ta, ma
vnclipu.wx v0, v16, a3
vnclipu.wx v4, v24, a3
vssseg2e8.v v0, (t0), a1
addi t0, t0, 2
bnez t6, 2b
mul t3, a1, t2
sub a2, a2, t2
add a0, a0, t3
vsetvli zero, zero, e8, m2, ta, ma
vnclipu.wx v8, v16, a3
vsetvli zero, t1, e\b, m2, ta, ma
vsse\b\().v v8, (a0), a1
add a0, a0, t2
bnez a2, 1b
ret
.endif
endfunc
.endm
.variant_cc ff_h264_biweight_pixels_8_rvv
func ff_h264_biweight_pixels_8_rvv, zve32x
@ -152,17 +151,12 @@ func ff_h264_biweight_pixels_8_rvv, zve32x
ret
endfunc
.irp w, 16, 8, 4, 2
func ff_h264_weight_pixels\w\()_8_rvv, zve32x
lpad 0
li a6, \w
.if \w == 16
j ff_h264_weight_pixels_simple_8_rvv
.else
j ff_h264_weight_pixels_8_rvv
.endif
endfunc
h264_weight 8, 2, 16
h264_weight 8, 4, 32
h264_weight 8, 8, 64
h264_weight 8, 16
.irp w, 16, 8, 4, 2
func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
lpad 0
li t6, \w

Loading…
Cancel
Save