lavc/h264dsp: use saturating add/sub for R-V V 8-bit DC add

T-Head C908 (cycles):
h264_idct4_dc_add_8bpp_c:      109.2
h264_idct4_dc_add_8bpp_rvv_i32: 34.5 (before)
h264_idct4_dc_add_8bpp_rvv_i32: 25.5 (after)
h264_idct8_dc_add_8bpp_c:      418.7
h264_idct8_dc_add_8bpp_rvv_i64: 69.5 (before)
h264_idct8_dc_add_8bpp_rvv_i64: 33.5 (after)
release/7.1
Rémi Denis-Courmont 6 months ago
parent 4713a5cc24
commit b0b3bea10b
  1. 5
      libavcodec/riscv/h264dsp_init.c
  2. 28
      libavcodec/riscv/h264idct_rvv.S

@ -98,8 +98,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
dsp->h264_idct8_add = ff_h264_idct8_add_8_rvv;
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
if (flags & AV_CPU_FLAG_RVB) {
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
dsp->h264_idct_add16 = ff_h264_idct_add16_8_rvv;
dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
# if __riscv_xlen == 64
@ -108,7 +108,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
}
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
if (flags & AV_CPU_FLAG_RVB)
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
}
dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
}

@ -420,7 +420,7 @@ endfunc
.endr
.macro idct_dc_add8 width
func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, b
lpad 0
.if \width == 8
vsetivli zero, \width, e8, mf2, ta, ma
@ -428,26 +428,34 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
vsetivli zero, \width, e8, mf4, ta, ma
.endif
lh t0, 0(a1)
li t1, 255
addi t0, t0, 32
srai t0, t0, 6
sh zero, 0(a1)
.if \width == 8
li a6, \width * \width
vlse64.v v24, (a0), a2
vsetvli zero, a6, e16, m8, ta, ma
vsetvli zero, a6, e8, m4, ta, ma
.else
vlse32.v v24, (a0), a2
vsetivli zero, \width * \width, e16, m2, ta, ma
vsetivli zero, \width * \width, e8, m1, ta, ma
.endif
vzext.vf2 v0, v24
vadd.vx v0, v0, t0
vmax.vx v0, v0, zero
.if \width == 8
vsetvli zero, zero, e8, m4, ta, ma
bgez t0, 1f
neg t0, t0
minu t0, t0, t1
vssubu.vx v24, v24, t0
.if \width == 8
vsetivli zero, \width, e8, mf2, ta, ma
vsse64.v v24, (a0), a2
.else
vsetvli zero, zero, e8, m1, ta, ma
vsetivli zero, \width, e8, mf4, ta, ma
vsse32.v v24, (a0), a2
.endif
vnclipu.wi v24, v0, 0
ret
1:
minu t0, t0, t1
vsaddu.vx v24, v24, t0
.if \width == 8
vsetivli zero, \width, e8, mf2, ta, ma
vsse64.v v24, (a0), a2

Loading…
Cancel
Save