lavc/vp8dsp: R-V V vp8_idct_add

T-Head C908 (cycles):
vp8_idct_add_c:       312.2
vp8_idct_add_rvv_i32: 117.0
release/7.1
Rémi Denis-Courmont 8 months ago
parent e0f4d185f1
commit 658439934b
  1. 2
      libavcodec/riscv/vp8dsp_init.c
  2. 59
      libavcodec/riscv/vp8dsp_rvv.S

@ -27,6 +27,7 @@
#include "vp8dsp.h"
void ff_vp8_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
@ -129,6 +130,7 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
if (flags & AV_CPU_FLAG_RVV_I64)
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_rvv;
#endif
c->vp8_idct_add = ff_vp8_idct_add_rvv;
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_rvv;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
if (flags & AV_CPU_FLAG_RVV_I64)

@ -98,6 +98,65 @@ func ff_vp8_luma_dc_wht_rvv, zve64x
endfunc
#endif
/*
 * 4x4 inverse transform of a coefficient block, added to the destination
 * pixels with clamping to 0..255; the coefficient block is zeroed afterwards.
 *
 * Arguments, following the C prototype
 *   void ff_vp8_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 * and assuming the standard RISC-V calling convention:
 *   a0 = dst, a1 = block, a2 = stride (bytes between dst rows).
 *
 * Structure: the local subroutine at label 1 performs one 1-D pass over the
 * four rows of the block. It is called twice, with an interleaved (transposing)
 * store in between, so that both dimensions are transformed. t0 is used as the
 * link register for these local calls, leaving ra untouched for the final ret.
 *
 * Scalar registers written: a3-a5, t0-t2. Vector registers written: v0-v9.
 * The vxrm CSR is also modified.
 *
 * NOTE(review): the sd instructions below are 64-bit stores; confirm that this
 * function is only assembled for XLEN >= 64 targets.
 */
func ff_vp8_idct_add_rvv, zve32x
csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up), used by vssra below
vsetivli zero, 4, e16, mf2, ta, ma # VL = 4 elements of 16 bits: one block row per vector
addi a3, a1, 1 * 4 * 2 # a3 = address of block row 1 (4 coefficients of 2 bytes per row)
addi a4, a1, 2 * 4 * 2 # a4 = address of block row 2
addi a5, a1, 3 * 4 * 2 # a5 = address of block row 3
li t1, 20091 # multiplier applied as x + ((x * 20091) >> 16) in the subroutine
li t2, 35468 # multiplier applied as (x * 35468) >> 16 in the subroutine
jal t0, 1f # first 1-D pass, results in v0-v3
vsseg4e16.v v0, (a1) # interleaved store of v0-v3: writes the 4x4 result back transposed
jal t0, 1f # second 1-D pass on the transposed data, results in v0-v3
vlsseg4e8.v v4, (a0), a2 # load 4 bytes from each of 4 dst rows; v4-v7 = byte 0-3 of every row
/* Rounded shift of the residual, (x + 4) >> 3 given vxrm = 0, interleaved
 * with scalar stores that clear the 32-byte coefficient block. */
vssra.vi v0, v0, 3
sd zero, (a1)
vssra.vi v1, v1, 3
sd zero, 8(a1)
vssra.vi v2, v2, 3
sd zero, 16(a1)
vssra.vi v3, v3, 3
sd zero, 24(a1)
vsetvli zero, zero, e8, mf4, ta, ma # SEW = 8 so the widening add takes 8-bit pixels, VL unchanged
/* 16-bit residual + zero-extended 8-bit pixel, 16-bit result. */
vwaddu.wv v0, v0, v4
vwaddu.wv v1, v1, v5
vwaddu.wv v2, v2, v6
vwaddu.wv v3, v3, v7
vsetvli zero, zero, e16, mf2, ta, ma # back to 16-bit elements for the signed clamp
/* Clamp negative sums to zero (signed max against x0). */
vmax.vx v0, v0, zero
vmax.vx v1, v1, zero
vmax.vx v2, v2, zero
vmax.vx v3, v3, zero
vsetvli zero, zero, e8, mf4, ta, ma # SEW = 8 for the narrowing clip
/* Narrow 16 -> 8 bits with unsigned saturation and no shift: upper clamp at
 * 255. Values are already non-negative thanks to the vmax above. */
vnclipu.wi v4, v0, 0
vnclipu.wi v5, v1, 0
vnclipu.wi v6, v2, 0
vnclipu.wi v7, v3, 0
vssseg4e8.v v4, (a0), a2 # store the 4x4 pixels back to dst, same layout as the load above
ret
/*
 * Local subroutine: one 1-D transform pass.
 * In:  a1, a3, a4, a5 = addresses of rows 0, 1, 2, 3; t1, t2 = multipliers;
 *      t0 = return address; vtype/VL as set by the caller (4 x e16).
 * Out: v0-v3 = transformed rows. Clobbers v4-v9.
 * The names t0-t3 in the trailing comments are transform temporaries, not the
 * scalar registers of the same name.
 */
1:
vle16.v v0, (a1)
vle16.v v2, (a4)
vle16.v v1, (a3)
vle16.v v3, (a5)
vadd.vv v4, v0, v2 # t0 = v0 + v2
vsub.vv v5, v0, v2 # t1 = v0 - v2
/* vmulhsu returns the high 16 bits of signed element x unsigned scalar; the
 * unsigned form is required because 35468 does not fit a signed 16-bit value. */
vmulhsu.vx v8, v3, t1
vmulhsu.vx v6, v1, t2
vadd.vv v8, v8, v3 # v8 = v3 + ((v3 * 20091) >> 16)
vmulhsu.vx v7, v1, t1
vmulhsu.vx v9, v3, t2
vadd.vv v7, v7, v1 # v7 = v1 + ((v1 * 20091) >> 16)
vsub.vv v6, v6, v8 # t2 = ((v1 * 35468) >> 16) - v8
vadd.vv v7, v7, v9 # t3 = v7 + ((v3 * 35468) >> 16)
vadd.vv v1, v5, v6 # out1 = t1 + t2
vsub.vv v2, v5, v6 # out2 = t1 - t2
vadd.vv v0, v4, v7 # out0 = t0 + t3
vsub.vv v3, v4, v7 # out3 = t0 - t3
jr t0 # return to the local caller via the t0 link register
endfunc
func ff_vp8_idct_dc_add_rvv, zve32x
lh a3, (a1)
addi a3, a3, 4

Loading…
Cancel
Save