aarch64: vp9itxfm: Restructure the idct32 store macros

This avoids concatenation, which can't be used if the whole macro
is wrapped within another macro.

This is also arguably more readable.

This is cherrypicked from libav commit
58d87e0f49.

Signed-off-by: Martin Storsjö <martin@martin.st>
pull/244/merge
Martin Storsjö 8 years ago
parent 31e41350d2
commit 52c7366c83
  1. 80
      libavcodec/aarch64/vp9itxfm_neon.S

@ -935,23 +935,23 @@ function idct32_1d_8x32_pass1_neon
// store_rev a, b: store \a and then \b at [x0], followed by the
// element-reversed copy of \b and then the element-reversed copy of \a.
// Every st1 post-increments x0 by 16 bytes, so one expansion writes four
// consecutive 16-byte vectors. v0 and v1 are overwritten with the reversed
// copies.
// NOTE(review): this listing appears to be a diff shown without +/- markers;
// each line spelling the operand as v\a\().8h / v\b\().8h (or invoking the
// macro with a bare register number) seems to be the removed form, followed
// by its replacement that takes the complete operand (e.g. v16.8h) - confirm
// against the actual file before assembling.
.macro store_rev a, b
// There's no rev128 instruction, but we reverse each 64 bit
// half, and then flip them using an ext with 8 bytes offset.
rev64 v1.8h, v\b\().8h
st1 {v\a\().8h}, [x0], #16
rev64 v0.8h, v\a\().8h
rev64 v1.8h, \b
st1 {\a}, [x0], #16
rev64 v0.8h, \a
// Swap the two 64-bit halves of v1 to finish the full reversal of \b.
ext v1.16b, v1.16b, v1.16b, #8
st1 {v\b\().8h}, [x0], #16
st1 {\b}, [x0], #16
// Same half swap for v0, completing the reversal of \a.
ext v0.16b, v0.16b, v0.16b, #8
st1 {v1.8h}, [x0], #16
st1 {v0.8h}, [x0], #16
.endm
// Pair v16..v23 with v24..v31, one expansion per pair.
store_rev 16, 24
store_rev 17, 25
store_rev 18, 26
store_rev 19, 27
store_rev 20, 28
store_rev 21, 29
store_rev 22, 30
store_rev 23, 31
store_rev v16.8h, v24.8h
store_rev v17.8h, v25.8h
store_rev v18.8h, v26.8h
store_rev v19.8h, v27.8h
store_rev v20.8h, v28.8h
store_rev v21.8h, v29.8h
store_rev v22.8h, v30.8h
store_rev v23.8h, v31.8h
// Rewind x0: eight expansions of four 16-byte post-incremented stores
// advance it by 8 * 4 * 16 = 512 bytes.
sub x0, x0, #512
// Drop the definition so the name store_rev can be defined again later.
.purgem store_rev
@ -977,14 +977,14 @@ function idct32_1d_8x32_pass1_neon
// subtracted from the output.
.macro store_rev a, b
ld1 {v4.8h}, [x0]
rev64 v1.8h, v\b\().8h
add v4.8h, v4.8h, v\a\().8h
rev64 v0.8h, v\a\().8h
rev64 v1.8h, \b
add v4.8h, v4.8h, \a
rev64 v0.8h, \a
st1 {v4.8h}, [x0], #16
ext v1.16b, v1.16b, v1.16b, #8
ld1 {v5.8h}, [x0]
ext v0.16b, v0.16b, v0.16b, #8
add v5.8h, v5.8h, v\b\().8h
add v5.8h, v5.8h, \b
st1 {v5.8h}, [x0], #16
ld1 {v6.8h}, [x0]
sub v6.8h, v6.8h, v1.8h
@ -994,14 +994,14 @@ function idct32_1d_8x32_pass1_neon
st1 {v7.8h}, [x0], #16
.endm
store_rev 31, 23
store_rev 30, 22
store_rev 29, 21
store_rev 28, 20
store_rev 27, 19
store_rev 26, 18
store_rev 25, 17
store_rev 24, 16
store_rev v31.8h, v23.8h
store_rev v30.8h, v22.8h
store_rev v29.8h, v21.8h
store_rev v28.8h, v20.8h
store_rev v27.8h, v19.8h
store_rev v26.8h, v18.8h
store_rev v25.8h, v17.8h
store_rev v24.8h, v16.8h
.purgem store_rev
ret
endfunc
@ -1047,21 +1047,21 @@ function idct32_1d_8x32_pass2_neon
.if \neg == 0
ld1 {v4.8h}, [x2], x9
ld1 {v5.8h}, [x2], x9
add v4.8h, v4.8h, v\a\().8h
add v4.8h, v4.8h, \a
ld1 {v6.8h}, [x2], x9
add v5.8h, v5.8h, v\b\().8h
add v5.8h, v5.8h, \b
ld1 {v7.8h}, [x2], x9
add v6.8h, v6.8h, v\c\().8h
add v7.8h, v7.8h, v\d\().8h
add v6.8h, v6.8h, \c
add v7.8h, v7.8h, \d
.else
ld1 {v4.8h}, [x2], x7
ld1 {v5.8h}, [x2], x7
sub v4.8h, v4.8h, v\a\().8h
sub v4.8h, v4.8h, \a
ld1 {v6.8h}, [x2], x7
sub v5.8h, v5.8h, v\b\().8h
sub v5.8h, v5.8h, \b
ld1 {v7.8h}, [x2], x7
sub v6.8h, v6.8h, v\c\().8h
sub v7.8h, v7.8h, v\d\().8h
sub v6.8h, v6.8h, \c
sub v7.8h, v7.8h, \d
.endif
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
@ -1085,15 +1085,15 @@ function idct32_1d_8x32_pass2_neon
st1 {v6.8b}, [x0], x1
st1 {v7.8b}, [x0], x1
.endm
load_acc_store 31, 30, 29, 28
load_acc_store 27, 26, 25, 24
load_acc_store 23, 22, 21, 20
load_acc_store 19, 18, 17, 16
load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
sub x2, x2, x9
load_acc_store 16, 17, 18, 19, 1
load_acc_store 20, 21, 22, 23, 1
load_acc_store 24, 25, 26, 27, 1
load_acc_store 28, 29, 30, 31, 1
load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
ret
endfunc

Loading…
Cancel
Save