swscale/rgb2rgb: rework RISC-V V shuffle_bytes_{0321,2103}

This avoids strided loads.

Before:
shuffle_bytes_0321_rvv_i32: 307.7
shuffle_bytes_2103_rvv_i32: 308.7

After:
shuffle_bytes_0321_rvv_i32: 59.7
shuffle_bytes_2103_rvv_i32: 61.5
pull/389/head
Rémi Denis-Courmont 2 years ago
parent d3948e4db5
commit 15982554e6
  1. 45
      libswscale/riscv/rgb2rgb_rvv.S

@ -21,38 +21,33 @@
#include "libavutil/riscv/asm.S"
func ff_shuffle_bytes_0321_rvv, zve32x
addi t1, a0, 3
addi t2, a0, 2
addi t3, a0, 1
li t1, 0x00ff00ff
j 1f
endfunc
func ff_shuffle_bytes_2103_rvv, zve32x
li t1, ~0x00ff00ff
1:
not t2, t1
srai a2, a2, 2
li t4, 4
2:
vsetvli t0, a2, e8, m1, ta, ma
sub a2, a2, t0
vlse8.v v8, (a0), t4
sh2add a0, t0, a0
vlse8.v v9, (t1), t4
sh2add t1, t0, t1
vlse8.v v10, (t2), t4
sh2add t2, t0, t2
vlse8.v v11, (t3), t4
sh2add t3, t0, t3
vsseg4e8.v v8, (a1)
sh2add a1, t0, a1
bnez a2, 2b
vsetvli t0, a2, e32, m8, ta, ma
vle32.v v8, (a0)
sub a2, a2, t0
vand.vx v16, v8, t2
sh2add a0, t0, a0
vand.vx v8, v8, t1
vsrl.vi v24, v16, 16
vsll.vi v16, v16, 16
vor.vv v8, v8, v24
vor.vv v8, v16, v8
vse32.v v8, (a1)
sh2add a1, t0, a1
bnez a2, 2b
ret
endfunc
func ff_shuffle_bytes_2103_rvv, zve32x
addi t1, a0, 1
addi t2, a0, 0
addi t3, a0, 3
addi a0, a0, 2
j 1b
endfunc
func ff_shuffle_bytes_1230_rvv, zve32x
addi t1, a0, 2
addi t2, a0, 3

Loading…
Cancel
Save