lavc/h264dsp: stick R-V V biweight to 16-bit

T-Head C908 (ns):
h264_biweight2_8_c:        2414.5
h264_biweight2_8_rvv_i32:   701.8 (before)
h264_biweight2_8_rvv_i32:   468.5 (after)
h264_biweight4_8_c:        4655.3
h264_biweight4_8_rvv_i32:  1377.5 (before)
h264_biweight4_8_rvv_i32:   931.8 (after)
h264_biweight8_8_c:        9701.5
h264_biweight8_8_rvv_i32:  2896.0 (before)
h264_biweight8_8_rvv_i32:  2070.5 (after)
h264_biweight16_8_c:      18025.0
h264_biweight16_8_rvv_i32: 3460.8 (before)
h264_biweight16_8_rvv_i32: 1978.0 (after)

SpacemiT X60 (ns):
h264_biweight2_8_c:        2415.5
h264_biweight2_8_rvv_i32:   478.2 (before)
h264_biweight2_8_rvv_i32:   362.8 (after)
h264_biweight4_8_c:        4655.3
h264_biweight4_8_rvv_i32:   946.7 (before)
h264_biweight4_8_rvv_i32:   727.3 (after)
h264_biweight8_8_c:        9061.8
h264_biweight8_8_rvv_i32:  2071.7 (before)
h264_biweight8_8_rvv_i32:  1685.8 (after)
h264_biweight16_8_c:      18020.5
h264_biweight16_8_rvv_i32: 3457.2 (before)
h264_biweight16_8_rvv_i32: 1935.8 (after)
release/7.1
Rémi Denis-Courmont 4 months ago
parent 670ff6c7ce
commit afd45c7ff7
  1. 43
      libavcodec/riscv/h264dsp_rvv.S

@@ -56,22 +56,21 @@ func ff_h264_biweight_pixels_simple_8_rvv, zve32x
addi a7, a7, 1
ori a7, a7, 1
sll a7, a7, a4
addi a4, a4, 1
1:
vsetvli zero, t6, e32, m4, ta, ma
vsetvli zero, t6, e16, m2, ta, ma
vle8.v v8, (a0)
addi a3, a3, -1
vle8.v v12, (a1)
add a1, a1, a2
vmv.v.x v16, a7
vsetvli zero, zero, e16, m2, ta, ma
vzext.vf2 v24, v8
vzext.vf2 v28, v12
vwmaccsu.vx v16, a5, v24
vwmaccsu.vx v16, a6, v28
vnclip.wx v16, v16, a4
vsetvli zero, zero, e8, m1, ta, ma
vwmaccsu.vx v16, a5, v8
vwmaccsu.vx v16, a6, v12
vsetvli zero, zero, e16, m2, ta, ma
vmax.vx v16, v16, zero
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v8, v16, 1
vnclipu.wx v8, v16, a4
vse8.v v8, (a0)
add a0, a0, a2
bnez a3, 1b
@@ -121,33 +120,29 @@ func ff_h264_biweight_pixels_8_rvv, zve32x
addi a7, a7, 1
ori a7, a7, 1
sll a7, a7, a4
addi a4, a4, 1
1:
mv t0, a0
mv t1, a1
mv t5, t6
2:
vsetvli t2, a3, e32, m8, ta, ma
vsetvli t2, a3, e16, m8, ta, ma
vlsseg2e8.v v0, (t0), a2
vlsseg2e8.v v4, (t1), a2
vlsseg2e8.v v8, (t1), a2
addi t5, t5, -2
vmv.v.x v16, a7
vmv.v.x v24, a7
vsetvli zero, zero, e16, m4, ta, ma
vzext.vf2 v8, v0
vzext.vf2 v12, v2
vwmaccsu.vx v16, a5, v8
vwmaccsu.vx v24, a5, v12
vzext.vf2 v8, v4
vzext.vf2 v12, v6
vsetvli zero, zero, e8, m4, ta, ma
vwmaccsu.vx v16, a5, v0
vwmaccsu.vx v24, a5, v4
vwmaccsu.vx v16, a6, v8
vwmaccsu.vx v24, a6, v12
vnclip.wx v8, v16, a4
vnclip.wx v12, v24, a4
vmax.vx v8, v8, zero
vmax.vx v12, v12, zero
vsetvli zero, zero, e8, m2, ta, ma
vnclipu.wi v0, v8, 1
vnclipu.wi v2, v12, 1
vsetvli zero, zero, e16, m8, ta, ma
vmax.vx v16, v16, zero
vmax.vx v24, v24, zero
vsetvli zero, zero, e8, m4, ta, ma
vnclipu.wx v0, v16, a4
vnclipu.wx v4, v24, a4
vssseg2e8.v v0, (t0), a2
addi t0, t0, 2
addi t1, t1, 2

Loading…
Cancel
Save