lavc/pixblockdsp: rework R-V V get_pixels_unaligned

As in the aligned case, we can use VLSE64.V, though the way of doing so
gets more convoluted, so the performance gains are more modest:

get_pixels_unaligned_c:       126.7
get_pixels_unaligned_rvv_i32: 145.5 (before)
get_pixels_unaligned_rvv_i64:  62.2 (after)

For reference, these are the aligned benchmarks (unchanged) on the
same T-Head C908 hardware:

get_pixels_c:                 126.7
get_pixels_rvi:                85.7
get_pixels_rvv_i64:            33.2
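
(These figures look like checkasm --bench timings; the commit does not record the exact invocation, but on an FFmpeg tree something along the lines of "make checkasm && tests/checkasm/checkasm --test=pixblockdsp --bench", run on the same hardware, should presumably reproduce them.)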
Branch: release/7.0
Author: Rémi Denis-Courmont
Commit: 02594c8c01 (parent f68ad5d2de)
 libavcodec/riscv/pixblockdsp_init.c | 17 +++++++----------
 libavcodec/riscv/pixblockdsp_rvv.S  | 28 +++++++++++++++++-----------
 2 files changed, 24 insertions(+), 21 deletions(-)

--- a/libavcodec/riscv/pixblockdsp_init.c
+++ b/libavcodec/riscv/pixblockdsp_init.c
@@ -56,20 +56,17 @@ av_cold void ff_pixblockdsp_init_riscv(PixblockDSPContext *c,
 
 #if HAVE_RVV
     if ((cpu_flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16) {
-        if (!high_bit_depth) {
-            c->get_pixels = ff_get_pixels_unaligned_8_rvv;
-            c->get_pixels_unaligned = ff_get_pixels_unaligned_8_rvv;
-        }
-
         c->diff_pixels = ff_diff_pixels_unaligned_rvv;
         c->diff_pixels_unaligned = ff_diff_pixels_unaligned_rvv;
     }
 
-    if (cpu_flags & AV_CPU_FLAG_RVV_I64) {
-        if (!high_bit_depth)
-            c->get_pixels = ff_get_pixels_8_rvv;
-
-        c->diff_pixels = ff_diff_pixels_rvv;
+    if ((cpu_flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) {
+        if (!high_bit_depth) {
+            c->get_pixels = ff_get_pixels_8_rvv;
+            c->get_pixels_unaligned = ff_get_pixels_unaligned_8_rvv;
+        }
+
+        c->diff_pixels = ff_diff_pixels_rvv;
     }
 #endif
 }
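
(Net effect on dispatch: both get_pixels function pointers are now set in the RVV_I64 block, which additionally gains a vector-length check (ff_get_rv_vlenb() >= 16), while the old RVV_I32 unaligned implementation, slower than the C reference per the benchmarks above, is removed outright.)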

--- a/libavcodec/riscv/pixblockdsp_rvv.S
+++ b/libavcodec/riscv/pixblockdsp_rvv.S
@@ -23,6 +23,7 @@
 func ff_get_pixels_8_rvv, zve64x
         vsetivli    zero, 8, e8, mf2, ta, ma
         li          t0, 8 * 8
+1:
         vlse64.v    v16, (a1), a2
         vsetvli     zero, t0, e8, m4, ta, ma
         vwcvtu.x.x.v v8, v16
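
(The new 1: label exists so that ff_get_pixels_unaligned_8_rvv below can branch back into this aligned load sequence, via beqz t1, 1b, whenever the source pointer turns out to be 8-byte aligned after all.)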
@@ -30,18 +31,23 @@ func ff_get_pixels_8_rvv, zve64x
         ret
 endfunc
 
-func ff_get_pixels_unaligned_8_rvv, zve32x
-        vsetivli    zero, 8, e8, mf2, ta, ma
-        vlsseg8e8.v v16, (a1), a2
+func ff_get_pixels_unaligned_8_rvv, zve64x
+        andi        t1, a1, 7
+        vsetivli    zero, 8, e64, m4, ta, ma
+        li          t0, 8 * 8
+        beqz        t1, 1b
+        andi        a1, a1, -8
+        slli        t2, t1, 3
+        addi        t1, a1, 8
+        sub         t3, t0, t2
+        vlse64.v    v16, (a1), a2
+        vlse64.v    v24, (t1), a2
+        vsrl.vx     v16, v16, t2
+        vsll.vx     v24, v24, t3
+        vor.vv      v16, v16, v24
+        vsetvli     zero, t0, e8, m4, ta, ma
         vwcvtu.x.x.v v8, v16
-        vwcvtu.x.x.v v9, v17
-        vwcvtu.x.x.v v10, v18
-        vwcvtu.x.x.v v11, v19
-        vwcvtu.x.x.v v12, v20
-        vwcvtu.x.x.v v13, v21
-        vwcvtu.x.x.v v14, v22
-        vwcvtu.x.x.v v15, v23
-        vsseg8e16.v v8, (a0)
+        vse16.v     v8, (a0)
         ret
 endfunc
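
(To spell out the shift arithmetic, here is a scalar C model of what the new unaligned path computes. The function name and the per-row loop are illustrative only; the real code loads all eight rows at once with strided vector loads. RISC-V is little-endian, so the aligned word at the lower address supplies the low-order bytes of each recombined row.)

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar model of ff_get_pixels_unaligned_8_rvv: copy an 8x8 block of
 * 8-bit pixels into int16_t block[64], emulating each row's unaligned
 * 64-bit load with two aligned 64-bit loads recombined by shifts. */
static void get_pixels_unaligned_model(int16_t *block, const uint8_t *pixels,
                                       ptrdiff_t stride)
{
    unsigned mis = (uintptr_t)pixels & 7; /* t1: misalignment in bytes */
    unsigned sr  = mis * 8;               /* t2: right-shift amount in bits */
    unsigned sl  = 64 - sr;               /* t3 = 64 - t2 */
    const uint8_t *base = pixels - mis;   /* andi a1, a1, -8 */

    for (int row = 0; row < 8; row++) {
        uint64_t w0, w1, v;

        memcpy(&w0, base, 8);             /* vlse64.v v16, (a1), a2 */
        memcpy(&w1, base + 8, 8);         /* vlse64.v v24, (t1), a2 */
        /* vsrl.vx + vsll.vx + vor.vv; the mis == 0 case is the
         * "beqz t1, 1b" branch into the aligned code (and avoids an
         * undefined 64-bit shift here) */
        v = mis ? (w0 >> sr) | (w1 << sl) : w0;
        for (int col = 0; col < 8; col++) /* vwcvtu.x.x.v + vse16.v */
            block[8 * row + col] = (v >> (8 * col)) & 0xff;
        base += stride;
    }
}

(Per the benchmarks above, trading the segmented vlsseg8e8.v load, which the C908 evidently executes slowly, for two ordinary strided 64-bit loads plus a shift/shift/OR recombination roughly halves the run time, even though it reads each row's data twice.)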
