lavc/aarch64: reformat add_res funcs

Signed-off-by: J. Dekker <jdek@itanimul.li>
pull/388/head
J. Dekker 2 years ago
parent ea6ecb12aa
commit aa9eabb7a5
  1. 216
      libavcodec/aarch64/hevcdsp_idct_neon.S

@ -27,21 +27,21 @@
#include "libavutil/aarch64/asm.S"
// trans: read-only table of 32 signed 16-bit constants, laid out four per row.
// NOTE(review): the code that consumes this table is not visible in this
// section; presumably these are the transform coefficients -- confirm
// against the users of `trans`.
const trans, align=4
        .short          64, 83, 64, 36
        .short          89, 75, 50, 18
        .short          90, 87, 80, 70
        .short          57, 43, 25, 9
        .short          90, 90, 88, 85
        .short          82, 78, 73, 67
        .short          61, 54, 46, 38
        .short          31, 22, 13, 4
endconst
// clip10 in1, in2, c1, c2
// Clamps every signed lane of \in1 and \in2 to the range [\c1, \c2], in place.
//   \in1, \in2  vector registers to clamp (both read and written)
//   \c1         vector holding the lower bound in each lane
//   \c2         vector holding the upper bound in each lane
// The callers visible in this file pass \c1 = 0 and \c2 = 0x3FF.
.macro clip10 in1, in2, c1, c2
        smax            \in1, \in1, \c1         // raise lanes below the lower bound
        smax            \in2, \in2, \c1
        smin            \in1, \in1, \c2         // lower lanes above the upper bound
        smin            \in2, \in2, \c2
.endm
function ff_hevc_add_residual_4x4_8_neon, export=1
@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
ld1 {v2.s}[1], [x0], x2
ld1 {v2.s}[2], [x0], x2
ld1 {v2.s}[3], [x0], x2
sub x0, x0, x2, lsl #2
uxtl v6.8h, v2.8b
uxtl2 v7.8h, v2.16b
sqadd v0.8h, v0.8h, v6.8h
sqadd v1.8h, v1.8h, v7.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
sub x0, x0, x2, lsl #2
uxtl v6.8h, v2.8b
uxtl2 v7.8h, v2.16b
sqadd v0.8h, v0.8h, v6.8h
sqadd v1.8h, v1.8h, v7.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
st1 {v0.s}[0], [x0], x2
st1 {v0.s}[1], [x0], x2
st1 {v0.s}[2], [x0], x2
@ -70,63 +70,63 @@ function ff_hevc_add_residual_4x4_10_neon, export=1
ld1 {v2.d}[0], [x12], x2
ld1 {v2.d}[1], [x12], x2
ld1 {v3.d}[0], [x12], x2
sqadd v0.8h, v0.8h, v2.8h
sqadd v0.8h, v0.8h, v2.8h
ld1 {v3.d}[1], [x12], x2
movi v4.8h, #0
sqadd v1.8h, v1.8h, v3.8h
mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
clip10 v0.8h, v1.8h, v4.8h, v5.8h
st1 {v0.d}[0], [x0], x2
st1 {v0.d}[1], [x0], x2
st1 {v1.d}[0], [x0], x2
st1 {v1.d}[1], [x0], x2
movi v4.8h, #0
sqadd v1.8h, v1.8h, v3.8h
mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
clip10 v0.8h, v1.8h, v4.8h, v5.8h
st1 {v0.d}[0], [x0], x2
st1 {v0.d}[1], [x0], x2
st1 {v1.d}[0], [x0], x2
st1 {v1.d}[1], [x0], x2
ret
endfunc
// ff_hevc_add_residual_8x8_8_neon
// Adds an 8x8 tile of signed 16-bit values read from x1 to the 8-bit rows at
// x0 and stores the sums back, saturated to 0..255.
//   x0  row pointer: read, then written; advanced two rows per iteration
//   x1  stream of 16-bit addends, 32 bytes consumed per iteration
//   x2  row pitch in bytes (doubled on entry, two rows are handled per pass)
// Scratch: x3 (rows remaining), x12 (odd-row pointer), v0-v3, condition flags.
// NOTE(review): the C prototype is not visible here -- presumably
// (uint8_t *dst, const int16_t *res, ptrdiff_t stride); confirm.
function ff_hevc_add_residual_8x8_8_neon, export=1
        add             x12, x0, x2             // x12 = second row
        add             x2, x2, x2              // step two rows at a time
        mov             x3, #8                  // 8 rows in total
1:      subs            x3, x3, #2              // flags are consumed by bne below
        ld1             {v2.d}[0], [x0]         // even row -> low half of v2
        ld1             {v2.d}[1], [x12]        // odd row  -> high half of v2
        uxtl            v3.8h, v2.8b            // widen even row to 16 bit
        ld1             {v0.8h-v1.8h}, [x1], #32
        uxtl2           v2.8h, v2.16b           // widen odd row to 16 bit
        sqadd           v0.8h, v0.8h, v3.8h
        sqadd           v1.8h, v1.8h, v2.8h
        sqxtun          v0.8b, v0.8h            // narrow with unsigned saturation
        sqxtun2         v0.16b, v1.8h
        st1             {v0.d}[0], [x0], x2
        st1             {v0.d}[1], [x12], x2
        bne             1b
        ret
endfunc
// ff_hevc_add_residual_8x8_10_neon
// Adds an 8x8 tile of signed 16-bit values read from x1 to the 16-bit rows at
// x0 and stores the sums back, clamped to 0..0x3FF.
//   x0  row pointer: read, then written; advanced two rows per iteration
//   x1  stream of 16-bit addends, 32 bytes consumed per iteration
//   x2  row pitch in bytes (doubled on entry, two rows are handled per pass)
// Scratch: x3 (rows remaining), x12 (odd-row pointer), v0-v5, condition flags.
// NOTE(review): the C prototype is not visible here -- presumably
// (uint8_t *dst, const int16_t *res, ptrdiff_t stride); confirm.
function ff_hevc_add_residual_8x8_10_neon, export=1
        add             x12, x0, x2             // x12 = second row
        add             x2, x2, x2              // step two rows at a time
        mov             x3, #8                  // 8 rows in total
        movi            v4.8h, #0               // lower clamp bound
        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs            x3, x3, #2              // flags are consumed by bne below
        ld1             {v0.8h-v1.8h}, [x1], #32
        ld1             {v2.8h}, [x0]           // even row
        sqadd           v0.8h, v0.8h, v2.8h
        ld1             {v3.8h}, [x12]          // odd row
        sqadd           v1.8h, v1.8h, v3.8h
        clip10          v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.8h}, [x0], x2
        st1             {v1.8h}, [x12], x2
        bne             1b
        ret
endfunc
// ff_hevc_add_residual_16x16_8_neon
// Visible data flow: 16 bytes are loaded from [x0] and from [x12] (= x0 + x2),
// widened to 16 bit with uxtl/uxtl2, added with signed saturation to values
// loaded from [x1] (post-incremented by 64), narrowed back to bytes with
// unsigned saturation (sqxtun/sqxtun2) and stored to [x0] / [x12], both
// post-incremented by x2. x3 starts at 16 and drops by 2 per pass.
// NOTE(review): this listing looks like a diff rendering with the +/- markers
// stripped: several instructions and the label `1:` appear twice in a row,
// presumably as before/after copies of the same line -- confirm against the
// real file before assembling.
// NOTE(review): the `@ -134,47 ...` marker below hides at least one line;
// v17 is read by the first sqadd but no visible instruction writes it.
function ff_hevc_add_residual_16x16_8_neon, export=1
mov x3, #16
mov x3, #16
add x12, x0, x2
add x2, x2, x2
1: subs x3, x3, #2
add x2, x2, x2
1: subs x3, x3, #2
ld1 {v16.16b}, [x0]
ld1 {v0.8h-v3.8h}, [x1], #64
ld1 {v19.16b}, [x12]
@ -134,47 +134,47 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
uxtl2 v18.8h, v16.16b
uxtl v20.8h, v19.8b
uxtl2 v21.8h, v19.16b
sqadd v0.8h, v0.8h, v17.8h
sqadd v1.8h, v1.8h, v18.8h
sqadd v2.8h, v2.8h, v20.8h
sqadd v3.8h, v3.8h, v21.8h
sqxtun v0.8b, v0.8h
sqadd v0.8h, v0.8h, v17.8h
sqadd v1.8h, v1.8h, v18.8h
sqadd v2.8h, v2.8h, v20.8h
sqadd v3.8h, v3.8h, v21.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
sqxtun v1.8b, v2.8h
sqxtun v1.8b, v2.8h
sqxtun2 v1.16b, v3.8h
st1 {v0.16b}, [x0], x2
st1 {v1.16b}, [x12], x2
bne 1b
bne 1b
ret
endfunc
// ff_hevc_add_residual_16x16_10_neon
// Adds a 16x16 tile of signed 16-bit values read from x1 to the 16-bit rows
// at x0 and stores the sums back, clamped to 0..0x3FF.
//   x0  row pointer: read, then written; advanced two rows per iteration
//   x1  stream of 16-bit addends, 64 bytes consumed per iteration
//   x2  row pitch in bytes (doubled on entry, two rows are handled per pass)
// Scratch: x3 (rows remaining), x12 (odd-row pointer), v0-v3, v16-v21,
// condition flags.
// NOTE(review): the C prototype is not visible here -- presumably
// (uint8_t *dst, const int16_t *res, ptrdiff_t stride); confirm.
function ff_hevc_add_residual_16x16_10_neon, export=1
        mov             x3, #16                 // 16 rows in total
        movi            v20.8h, #0              // lower clamp bound
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
        add             x12, x0, x2             // x12 = second row
        add             x2, x2, x2              // step two rows at a time
1:      subs            x3, x3, #2              // flags are consumed by bne below
        ld1             {v16.8h-v17.8h}, [x0]   // even row
        ld1             {v0.8h-v3.8h}, [x1], #64
        sqadd           v0.8h, v0.8h, v16.8h
        ld1             {v18.8h-v19.8h}, [x12]  // odd row
        sqadd           v1.8h, v1.8h, v17.8h
        sqadd           v2.8h, v2.8h, v18.8h
        sqadd           v3.8h, v3.8h, v19.8h
        clip10          v0.8h, v1.8h, v20.8h, v21.8h
        clip10          v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v1.8h}, [x0], x2
        st1             {v2.8h-v3.8h}, [x12], x2
        bne             1b
        ret
endfunc
// ff_hevc_add_residual_32x32_8_neon
// Visible data flow: 32 bytes are loaded from [x0] into v20/v21 and widened to
// 16 bit; eight saturating adds combine v0-v7 with v16-v23; the sums are
// narrowed to bytes with unsigned saturation (sqxtun/sqxtun2) and stored as
// 32 bytes to [x0] and 32 bytes to [x12], both post-incremented by x2.
// x12 starts at x0 + x2, and x3 starts at 32 and drops by 2 per pass.
// NOTE(review): this listing looks like a diff rendering with the +/- markers
// stripped: several instructions and the label `1:` appear twice, presumably
// as before/after copies of the same line -- confirm against the real file
// before assembling.
// NOTE(review): the `@ -187,43 ...` marker below hides several lines; the
// loads that fill v0-v7 and v22/v23 and the writes to v18/v19 are not visible.
function ff_hevc_add_residual_32x32_8_neon, export=1
add x12, x0, x2
add x2, x2, x2
mov x3, #32
1: subs x3, x3, #2
add x2, x2, x2
mov x3, #32
1: subs x3, x3, #2
ld1 {v20.16b, v21.16b}, [x0]
uxtl v16.8h, v20.8b
uxtl2 v17.8h, v20.16b
@ -187,43 +187,43 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
uxtl2 v21.8h, v22.16b
uxtl v22.8h, v23.8b
uxtl2 v23.8h, v23.16b
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
sqadd v4.8h, v4.8h, v20.8h
sqadd v5.8h, v5.8h, v21.8h
sqadd v6.8h, v6.8h, v22.8h
sqadd v7.8h, v7.8h, v23.8h
sqxtun v0.8b, v0.8h
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v17.8h
sqadd v2.8h, v2.8h, v18.8h
sqadd v3.8h, v3.8h, v19.8h
sqadd v4.8h, v4.8h, v20.8h
sqadd v5.8h, v5.8h, v21.8h
sqadd v6.8h, v6.8h, v22.8h
sqadd v7.8h, v7.8h, v23.8h
sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
sqxtun v1.8b, v2.8h
sqxtun v1.8b, v2.8h
sqxtun2 v1.16b, v3.8h
sqxtun v2.8b, v4.8h
sqxtun v2.8b, v4.8h
sqxtun2 v2.16b, v5.8h
st1 {v0.16b, v1.16b}, [x0], x2
sqxtun v3.8b, v6.8h
st1 {v0.16b, v1.16b}, [x0], x2
sqxtun v3.8b, v6.8h
sqxtun2 v3.16b, v7.8h
st1 {v2.16b, v3.16b}, [x12], x2
bne 1b
bne 1b
ret
endfunc
// ff_hevc_add_residual_32x32_10_neon
// Adds a 32x32 tile of signed 16-bit values read from x1 to the 16-bit rows
// at x0 and stores the sums back, clamped to 0..0x3FF. One full row
// (32 halfwords = 64 bytes) is processed per iteration.
//   x0  row pointer: read, then written; post-incremented by x2 per row
//   x1  stream of 16-bit addends, 64 bytes consumed per iteration
//   x2  row pitch in bytes
// Scratch: x3 (rows remaining), v0-v3, v16-v21, condition flags.
// NOTE(review): the C prototype is not visible here -- presumably
// (uint8_t *dst, const int16_t *res, ptrdiff_t stride); confirm.
function ff_hevc_add_residual_32x32_10_neon, export=1
        mov             x3, #32                 // 32 rows in total
        movi            v20.8h, #0              // lower clamp bound
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs            x3, x3, #1              // flags are consumed by bne below
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v16.8h-v19.8h}, [x0]
        sqadd           v0.8h, v0.8h, v16.8h
        sqadd           v1.8h, v1.8h, v17.8h
        sqadd           v2.8h, v2.8h, v18.8h
        sqadd           v3.8h, v3.8h, v19.8h
        clip10          v0.8h, v1.8h, v20.8h, v21.8h
        clip10          v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v3.8h}, [x0], x2
        bne             1b
        ret
endfunc

Loading…
Cancel
Save