lavc/me_cmp: R-V V sse

C908:
sse_0_c: 614.7
sse_0_rvv_i32: 138.2
sse_1_c: 302.7
sse_1_rvv_i32: 107.2
sse_2_c: 175.7
sse_2_rvv_i32: 104.2

Signed-off-by: Rémi Denis-Courmont <remi@remlab.net>
release/7.0
sunyuechi 12 months ago committed by Rémi Denis-Courmont
parent 37463d7979
commit 9cb8f262f2
  1. 11
      libavcodec/riscv/me_cmp_init.c
  2. 66
      libavcodec/riscv/me_cmp_rvv.S

@ -39,6 +39,13 @@ int ff_pix_abs16_y2_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *p
int ff_pix_abs8_y2_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
/*
 * Sum-of-squared-errors kernels implemented in RISC-V Vector assembly.
 * The numeric suffix is the number of pixels (bytes) read from each row:
 * 16, 8 or 4.
 *
 * v      - context pointer; the assembly implementations never read it
 * pix1   - first block, one byte per pixel
 * pix2   - second block, one byte per pixel
 * stride - byte offset added to both pix1 and pix2 after every row
 * h      - number of rows; the assembly loops test the counter only after
 *          processing a row, so h is expected to be at least 1
 *
 * Returns the sum over all rows and columns of (pix1 - pix2)^2.
 */
int ff_sse16_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sse8_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sse4_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_riscv(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_RVV
@ -53,6 +60,10 @@ av_cold void ff_me_cmp_init_riscv(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[1][1] = ff_pix_abs8_x2_rvv;
c->pix_abs[0][2] = ff_pix_abs16_y2_rvv;
c->pix_abs[1][2] = ff_pix_abs8_y2_rvv;
c->sse[0] = ff_sse16_rvv;
c->sse[1] = ff_sse8_rvv;
c->sse[2] = ff_sse4_rvv;
}
#endif
}

@ -165,3 +165,69 @@ func ff_pix_abs8_y2_rvv, zve32x
pix_abs_ret
endfunc
// Sum of squared differences between two blocks whose rows are 16 pixels wide.
// Arguments, following the C prototype and the standard calling convention:
//   a0 = context pointer (never read here), a1 = pix1, a2 = pix2,
//   a3 = stride in bytes, a4 = h (row count).
// The 32-bit sum is returned in a0.
// The row counter is tested after the loop body, so h must be at least 1.
// NOTE(review): vl only reaches 16 at e32/m4 when VLEN >= 128 bits;
// presumably the init code checks this before installing the function - confirm.
func ff_sse16_rvv, zve32x
// Request 16 elements of 32 bits (LMUL=4) for the accumulators.
// t0 receives the granted vl but is not used afterwards.
vsetivli t0, 16, e32, m4, ta, ma
// v24 group: one 32-bit accumulator per pixel column, cleared.
vmv.v.x v24, zero
// v0[0]: zero seed for the final reduction.
vmv.s.x v0, zero
1:
// Keep vl, switch to 8-bit elements (same SEW/LMUL ratio) and load one row
// from each block.
vsetvli zero, zero, e8, m1, tu, ma
vle8.v v4, (a1)
vle8.v v12, (a2)
// One row consumed.
addi a4, a4, -1
// v16 (16-bit) = pix1 - pix2. The widening unsigned subtract wraps modulo
// 2^16, so the result reads back as the signed difference.
vwsubu.vv v16, v4, v12
// Keep vl, switch to 16-bit elements for the widening multiply-accumulate.
vsetvli zero, zero, e16, m2, tu, ma
// v24 (32-bit) += v16 * v16 (signed widening multiply-accumulate).
vwmacc.vv v24, v16, v16
// Advance both pointers to the next row.
add a1, a1, a3
add a2, a2, a3
bnez a4, 1b
// Back to 32-bit elements: fold all accumulators into v0[0].
vsetvli zero, zero, e32, m4, tu, ma
vredsum.vs v0, v24, v0
// Return the scalar sum.
vmv.x.s a0, v0
ret
endfunc
// Sum of squared differences between two blocks whose rows are 8 pixels wide.
// Arguments, following the C prototype and the standard calling convention:
//   a0 = context pointer (never read here), a1 = pix1, a2 = pix2,
//   a3 = stride in bytes, a4 = h (row count).
// The 32-bit sum is returned in a0.
// The row counter is tested after the loop body, so h must be at least 1.
// NOTE(review): vl only reaches 8 at e32/m2 when VLEN >= 128 bits;
// presumably the init code checks this before installing the function - confirm.
func ff_sse8_rvv, zve32x
// Request 8 elements of 32 bits (LMUL=2) for the accumulators.
// t0 receives the granted vl but is not used afterwards.
vsetivli t0, 8, e32, m2, ta, ma
// v24 group: one 32-bit accumulator per pixel column, cleared.
vmv.v.x v24, zero
// v0[0]: zero seed for the final reduction.
vmv.s.x v0, zero
1:
// Keep vl, switch to 8-bit elements (same SEW/LMUL ratio) and load one row
// from each block.
vsetvli zero, zero, e8, mf2, tu, ma
vle8.v v4, (a1)
vle8.v v12, (a2)
// One row consumed.
addi a4, a4, -1
// v16 (16-bit) = pix1 - pix2. The widening unsigned subtract wraps modulo
// 2^16, so the result reads back as the signed difference.
vwsubu.vv v16, v4, v12
// Keep vl, switch to 16-bit elements for the widening multiply-accumulate.
vsetvli zero, zero, e16, m1, tu, ma
// v24 (32-bit) += v16 * v16 (signed widening multiply-accumulate).
vwmacc.vv v24, v16, v16
// Advance both pointers to the next row.
add a1, a1, a3
add a2, a2, a3
bnez a4, 1b
// Back to 32-bit elements: fold all accumulators into v0[0].
vsetvli zero, zero, e32, m2, tu, ma
vredsum.vs v0, v24, v0
// Return the scalar sum.
vmv.x.s a0, v0
ret
endfunc
// Sum of squared differences between two blocks whose rows are 4 pixels wide.
// Arguments, following the C prototype and the standard calling convention:
//   a0 = context pointer (never read here), a1 = pix1, a2 = pix2,
//   a3 = stride in bytes, a4 = h (row count).
// The 32-bit sum is returned in a0.
// The row counter is tested after the loop body, so h must be at least 1.
// NOTE(review): vl only reaches 4 at e32/m1 when VLEN >= 128 bits;
// presumably the init code checks this before installing the function - confirm.
func ff_sse4_rvv, zve32x
// Request 4 elements of 32 bits (LMUL=1) for the accumulators.
// t0 receives the granted vl but is not used afterwards.
vsetivli t0, 4, e32, m1, ta, ma
// v24: one 32-bit accumulator per pixel column, cleared.
vmv.v.x v24, zero
// v0[0]: zero seed for the final reduction.
vmv.s.x v0, zero
1:
// Keep vl, switch to 8-bit elements (same SEW/LMUL ratio) and load one row
// from each block.
vsetvli zero, zero, e8, mf4, tu, ma
vle8.v v4, (a1)
vle8.v v12, (a2)
// One row consumed.
addi a4, a4, -1
// v16 (16-bit) = pix1 - pix2. The widening unsigned subtract wraps modulo
// 2^16, so the result reads back as the signed difference.
vwsubu.vv v16, v4, v12
// Keep vl, switch to 16-bit elements for the widening multiply-accumulate.
vsetvli zero, zero, e16, mf2, tu, ma
// v24 (32-bit) += v16 * v16 (signed widening multiply-accumulate).
vwmacc.vv v24, v16, v16
// Advance both pointers to the next row.
add a1, a1, a3
add a2, a2, a3
bnez a4, 1b
// Back to 32-bit elements: fold all accumulators into v0[0].
vsetvli zero, zero, e32, m1, tu, ma
vredsum.vs v0, v24, v0
// Return the scalar sum.
vmv.x.s a0, v0
ret
endfunc

Loading…
Cancel
Save