lavc/h264dsp: reduce spills in R-V V idct_add16

release/7.1
Rémi Denis-Courmont 8 months ago
parent 245f76ad74
commit 9135dffd17
  1. 100
      libavcodec/riscv/h264idct_rvv.S

@@ -422,9 +422,9 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
.else .else
vsetivli zero, \width, e8, mf4, ta, ma vsetivli zero, \width, e8, mf4, ta, ma
.endif .endif
lh a3, 0(a1) lh t0, 0(a1)
addi a3, a3, 32 addi t0, t0, 32
srai a3, a3, 6 srai t0, t0, 6
sh zero, 0(a1) sh zero, 0(a1)
.if \width == 8 .if \width == 8
li a6, \width * \width li a6, \width * \width
@@ -435,7 +435,7 @@ func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
vsetivli zero, \width * \width, e16, m2, ta, ma vsetivli zero, \width * \width, e16, m2, ta, ma
.endif .endif
vzext.vf2 v0, v24 vzext.vf2 v0, v24
vadd.vx v0, v0, a3 vadd.vx v0, v0, t0
vmax.vx v0, v0, zero vmax.vx v0, v0, zero
.if \width == 8 .if \width == 8
vsetvli zero, zero, e8, m4, ta, ma vsetvli zero, zero, e8, m4, ta, ma
@@ -464,33 +464,33 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
.else .else
vsetivli zero, \width, e16, mf2, ta, ma vsetivli zero, \width, e16, mf2, ta, ma
.endif .endif
lw a3, 0(a1) lw t0, 0(a1)
addi a3, a3, 32 addi t0, t0, 32
srai a3, a3, 6 srai t0, t0, 6
sw zero, 0(a1) sw zero, 0(a1)
add t4, a0, a2 add t1, a0, a2
sh1add t5, a2, a0 sh1add t2, a2, a0
sh1add t6, a2, t4 sh1add t3, a2, t1
.if \width == 8 .if \width == 8
sh2add t0, a2, a0 sh2add t4, a2, a0
sh2add t1, a2, t4 sh2add t5, a2, t1
sh2add t2, a2, t5 sh2add t6, a2, t2
sh2add t3, a2, t6 sh2add a7, a2, t3
.endif .endif
vle16.v v0, (a0) vle16.v v0, (a0)
vle16.v v1, (t4) vle16.v v1, (t1)
vle16.v v2, (t5) vle16.v v2, (t2)
vle16.v v3, (t6) vle16.v v3, (t3)
.if \width == 8 .if \width == 8
vle16.v v4, (t0) vle16.v v4, (t4)
vle16.v v5, (t1) vle16.v v5, (t5)
vle16.v v6, (t2) vle16.v v6, (t6)
vle16.v v7, (t3) vle16.v v7, (a7)
vsetvli a6, zero, e16, m8, ta, ma vsetvli a6, zero, e16, m8, ta, ma
.else .else
vsetvli a6, zero, e16, m4, ta, ma vsetvli a6, zero, e16, m4, ta, ma
.endif .endif
vadd.vx v0, v0, a3 vadd.vx v0, v0, t0
vmax.vx v0, v0, zero vmax.vx v0, v0, zero
vmin.vx v0, v0, a5 vmin.vx v0, v0, a5
.if \width == 8 .if \width == 8
@@ -499,14 +499,14 @@ func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
vsetivli zero, \width, e16, mf2, ta, ma vsetivli zero, \width, e16, mf2, ta, ma
.endif .endif
vse16.v v0, (a0) vse16.v v0, (a0)
vse16.v v1, (t4) vse16.v v1, (t1)
vse16.v v2, (t5) vse16.v v2, (t2)
vse16.v v3, (t6) vse16.v v3, (t3)
.if \width == 8 .if \width == 8
vse16.v v4, (t0) vse16.v v4, (t4)
vse16.v v5, (t1) vse16.v v5, (t5)
vse16.v v6, (t2) vse16.v v6, (t6)
vse16.v v7, (t3) vse16.v v7, (a7)
.endif .endif
ret ret
endfunc endfunc
@@ -536,17 +536,12 @@ endconst
.macro idct4_adds type, depth .macro idct4_adds type, depth
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
csrwi vxrm, 0 csrwi vxrm, 0
addi sp, sp, -64 addi sp, sp, -16
lla t0, ff_h264_scan8 lla t0, ff_h264_scan8
sd s0, (sp) sd s0, (sp)
li t1, 32 * (\depth / 8) li t1, 32 * (\depth / 8)
mv s0, sp mv s0, sp
sd ra, 8(sp) sd ra, 8(sp)
sd s1, 16(sp)
sd s2, 24(sp)
sd s3, 32(sp)
sd s4, 40(sp)
sd s5, 48(sp)
vsetivli zero, 16, e8, m1, ta, ma vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0) vle8.v v8, (t0)
.if \depth == 8 .if \depth == 8
@@ -570,23 +565,23 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
vmand.mm v1, v1, v2 vmand.mm v1, v1, v2
.endif .endif
vsetvli zero, zero, e16, m2, ta, ma vsetvli zero, zero, e16, m2, ta, ma
vmv.x.s s2, v0 vmv.x.s a4, v0
vmv.x.s s3, v1 vmv.x.s a7, v1
li s1, 16 mv t4, a0
mv s4, a0 mv t5, a1
mv s5, a1
mv a1, a2 mv a1, a2
mv a2, a3 mv a2, a3
li a3, 16
1: 1:
andi t0, s2, 1 andi t0, a4, 1
addi s1, s1, -1 addi a3, a3, -1
srli s2, s2, 1 srli a4, a4, 1
.ifc \type, 16 .ifc \type, 16
beqz t0, 3f # if (nnz) beqz t0, 3f # if (nnz)
.endif .endif
lw t2, (s5) # block_offset[i] lw t2, (t5) # block_offset[i]
andi t1, s3, 1 andi t1, a7, 1
add a0, s4, t2 add a0, t4, t2
.ifc \type, 16 .ifc \type, 16
bnez t1, 2f # if (nnz == 1 && block[i * 16]) bnez t1, 2f # if (nnz == 1 && block[i * 16])
.else .else
@@ -600,19 +595,14 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
.endif .endif
jal ff_h264_idct4_dc_add_\depth\()_rvv jal ff_h264_idct4_dc_add_\depth\()_rvv
3: 3:
srli s3, s3, 1 srli a7, a7, 1
addi s5, s5, 4 addi t5, t5, 4
addi a1, a1, 16 * 2 * (\depth / 8) addi a1, a1, 16 * 2 * (\depth / 8)
bnez s1, 1b bnez a3, 1b
ld s5, 48(sp)
ld s4, 40(sp)
ld s3, 32(sp)
ld s2, 24(sp)
ld s1, 16(sp)
ld ra, 8(sp) ld ra, 8(sp)
ld s0, 0(sp) ld s0, 0(sp)
addi sp, sp, 64 addi sp, sp, 16
ret ret
endfunc endfunc
.endm .endm

Loading…
Cancel
Save