|
|
|
/* -*-arm64-*-
|
|
|
|
* vim: syntax=arm64asm
|
|
|
|
*
|
|
|
|
* Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
|
|
|
|
*
|
|
|
|
* This file is part of FFmpeg.
|
|
|
|
*
|
|
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "libavutil/aarch64/asm.S"
|
|
|
|
#define MAX_PB_SIZE 64
|
|
|
|
|
|
|
|
// Signed 8-tap filter coefficients, one 8-byte row per filter index 0..3.
// Row 0 is all zeros. Rows are addressed as base + (index << 3) by the
// load_filter and load_qpel_filterh macros.
const qpel_filters, align=4
        .byte            0,  0,  0,  0,  0,  0,  0,  0
        .byte           -1,  4,-10, 58, 17, -5,  1,  0
        .byte           -1,  4,-11, 40, 40,-11,  4, -1
        .byte            0,  1, -5, 17, 58,-10,  4, -1
endconst
|
|
|
|
|
|
|
|
// Magnitudes (absolute values) of the qpel_filters coefficients, same row
// layout (8 bytes per filter index). Read by load_qpel_filterb; the sign of
// each tap is applied by the umlal/umlsl choice in calc_qpelb/calc_qpelb2.
const qpel_filters_abs, align=4
        .byte            0,  0,  0,  0,  0,  0,  0,  0
        .byte            1,  4, 10, 58, 17,  5,  1,  0
        .byte            1,  4, 11, 40, 40, 11,  4,  1
        .byte            0,  1,  5, 17, 58, 10,  4,  1
endconst
|
|
|
|
|
|
|
|
// load_filter m
//   m: 64-bit register holding the filter row index.
// Loads the eight signed byte taps of row m from qpel_filters and
// sign-extends them to 16 bit, leaving them in v0.8h.
// Clobbers: x15, v0.
.macro load_filter m
        movrel          x15, qpel_filters
        add             x15, x15, \m, lsl #3    // each row is 8 bytes
        ld1             {v0.8b}, [x15]
        sxtl            v0.8h, v0.8b
.endm
|
|
|
|
|
|
|
|
// load_qpel_filterb freg, xreg
//   freg: 64-bit register holding the filter row index.
//   xreg: 64-bit scratch register used as the table pointer.
// Loads row freg of qpel_filters_abs and broadcasts tap k to every byte
// lane of vk, for k = 0..7 (two ld4r loads of four taps each).
// Clobbers: xreg (left pointing 4 bytes into the row), v0-v7.
.macro load_qpel_filterb freg, xreg
        movrel          \xreg, qpel_filters_abs
        add             \xreg, \xreg, \freg, lsl #3     // each row is 8 bytes
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
        ld4r            {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
.endm
|
|
|
|
|
|
|
|
// calc_qpelb dst, src0..src7
// 8-tap filter on the low eight byte lanes of src0..src7, widened to 16 bit:
//   dst.8h = - src0*v0 + src1*v1 - src2*v2 + src3*v3
//            + src4*v4 - src5*v5 + src6*v6 - src7*v7
// v0-v7 must hold the per-tap magnitudes (as set up by load_qpel_filterb);
// the sign of each tap is encoded by the choice of umlal versus umlsl.
// The sequence starts with umull on tap 1 so that dst needs no prior clear.
.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull           \dst\().8h, \src1\().8b, v1.8b
        umlsl           \dst\().8h, \src0\().8b, v0.8b
        umlsl           \dst\().8h, \src2\().8b, v2.8b
        umlal           \dst\().8h, \src3\().8b, v3.8b
        umlal           \dst\().8h, \src4\().8b, v4.8b
        umlsl           \dst\().8h, \src5\().8b, v5.8b
        umlal           \dst\().8h, \src6\().8b, v6.8b
        umlsl           \dst\().8h, \src7\().8b, v7.8b
.endm
|
|
|
|
|
|
|
|
// calc_qpelb2 dst, src0..src7
// Same filter as calc_qpelb, but applied to the upper eight byte lanes of
// the 16-byte sources (the "2" forms of the widening multiplies):
//   dst.8h = - src0*v0 + src1*v1 - src2*v2 + src3*v3
//            + src4*v4 - src5*v5 + src6*v6 - src7*v7
// v0-v7 must hold the per-tap magnitudes in all 16 byte lanes.
.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull2          \dst\().8h, \src1\().16b, v1.16b
        umlsl2          \dst\().8h, \src0\().16b, v0.16b
        umlsl2          \dst\().8h, \src2\().16b, v2.16b
        umlal2          \dst\().8h, \src3\().16b, v3.16b
        umlal2          \dst\().8h, \src4\().16b, v4.16b
        umlsl2          \dst\().8h, \src5\().16b, v5.16b
        umlal2          \dst\().8h, \src6\().16b, v6.16b
        umlsl2          \dst\().8h, \src7\().16b, v7.16b
.endm
|
|
|
|
|
|
|
|
// load_qpel_filterh freg, xreg
//   freg: 64-bit register holding the filter row index.
//   xreg: 64-bit scratch register used as the table pointer.
// Loads the eight signed byte taps of row freg from qpel_filters and
// sign-extends them to 16 bit in v0.8h (the form consumed by calc_qpelh
// and calc_qpelh2, which index v0.h[0..7]).
// Clobbers: xreg, v0.
.macro load_qpel_filterh freg, xreg
        movrel          \xreg, qpel_filters
        add             \xreg, \xreg, \freg, lsl #3     // each row is 8 bytes
        ld1             {v0.8b}, [\xreg]
        sxtl            v0.8h, v0.8b
.endm
|
|
|
|
|
|
|
|
// calc_qpelh dst, src0..src7, op, shift=6
// 8-tap filter on the low four halfword lanes of src0..src7 with signed
// widening to 32 bit:
//   dst.4s = sum over k = 0..7 of srck.4h * v0.h[k]
// v0.8h must hold the signed taps (see load_qpel_filterh).
// The accumulated value is then post-processed with op and shift:
//   op == sshr : arithmetic shift right in place, result stays in dst.4s
//   otherwise  : op is emitted as "op dst.4h, dst.4s, shift", so it has to
//                be an instruction that narrows 32 bit lanes to 16 bit.
.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
        smull           \dst\().4s, \src0\().4h, v0.h[0]
        smlal           \dst\().4s, \src1\().4h, v0.h[1]
        smlal           \dst\().4s, \src2\().4h, v0.h[2]
        smlal           \dst\().4s, \src3\().4h, v0.h[3]
        smlal           \dst\().4s, \src4\().4h, v0.h[4]
        smlal           \dst\().4s, \src5\().4h, v0.h[5]
        smlal           \dst\().4s, \src6\().4h, v0.h[6]
        smlal           \dst\().4s, \src7\().4h, v0.h[7]
.ifc \op, sshr
        sshr            \dst\().4s, \dst\().4s, \shift
.else
        \op             \dst\().4h, \dst\().4s, \shift
.endif
.endm
|
|
|
|
|
|
|
|
// calc_qpelh2 dst, dstt, src0..src7, op, shift=6
// Upper-half counterpart of calc_qpelh: filters the upper four halfword
// lanes of src0..src7, accumulating into the temporary dstt:
//   dstt.4s = sum over k = 0..7 of srck.8h[4..7] * v0.h[k]
// v0.8h must hold the signed taps (see load_qpel_filterh).
// Post-processing with op and shift:
//   op == sshr : dst.4s = dstt.4s >> shift (arithmetic)
//   otherwise  : op is emitted as "op dst.8h, dstt.4s, shift".
//                NOTE(review): this operand shape matches the second-half
//                narrowing instructions (for example sqshrn2); call sites
//                are not visible in this part of the file - confirm.
// Clobbers: dstt.
.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
        smull2          \dstt\().4s, \src0\().8h, v0.h[0]
        smlal2          \dstt\().4s, \src1\().8h, v0.h[1]
        smlal2          \dstt\().4s, \src2\().8h, v0.h[2]
        smlal2          \dstt\().4s, \src3\().8h, v0.h[3]
        smlal2          \dstt\().4s, \src4\().8h, v0.h[4]
        smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
        smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
        smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
.ifc \op, sshr
        sshr            \dst\().4s, \dstt\().4s, \shift
.else
        \op             \dst\().8h, \dstt\().4s, \shift
.endif
.endm
|
|
|
|
|
|
|
|
// calc_all
// Emits eight invocations of a macro named calc, rotating v16-v23 by one
// register per step so that the register window slides without any moves.
// Each invocation receives nine register names; the first and the last are
// always the same register.
// Requirements on the code that uses this macro (none of it is defined here):
//   - a macro named calc taking nine arguments must exist at expansion time;
//   - calc must leave the condition flags set, because they are tested
//     directly after each invocation;
//   - local label 1 must precede the expansion (loop head, reached by the
//     final b.hi) and local label 2 must follow it (exit, reached by b.eq).
// NOTE(review): presumably the first argument is the register that calc
// refills with the next input row - confirm against the calc definitions.
.macro calc_all
        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            2f
        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            2f
        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            2f
        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            2f
        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            2f
        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            2f
        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            2f
        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
        b.hi            1b
.endm
|
|
|
|
|
|
|
|
// calc_all2
// Two-register-per-row variant of calc_all: emits eight invocations of a
// macro named calc, each with eighteen register names. The even registers
// v16..v30 and the odd registers v17..v31 form two parallel windows that
// rotate by one row (two registers) per step.
// Requirements on the code that uses this macro (none of it is defined here):
//   - a macro named calc taking eighteen arguments must exist at expansion
//     time and must leave the condition flags set;
//   - local label 1 must precede the expansion (loop head, reached by the
//     final b.hi) and local label 2 must follow it (exit, reached by b.eq).
.macro calc_all2
        calc            v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
        b.eq            2f
        calc            v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
        b.eq            2f
        calc            v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
        b.eq            2f
        calc            v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
        b.eq            2f
        calc            v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
        b.eq            2f
        calc            v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
        b.eq            2f
        calc            v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
        b.eq            2f
        calc            v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
        b.hi            1b
.endm
|
|
|
|
|
|
|
|
.macro put_hevc type
|
|
|
|
.ifc \type, qpel
        // Register aliases for the plain qpel variant (int16_t output).
        // void put_hevc_qpel_h(int16_t *dst,
        //                      uint8_t *_src, ptrdiff_t _srcstride,
        //                      int height, intptr_t mx, intptr_t my, int width)
        // dststride is not an argument of this variant: x7 is a scratch
        // register that the functions below set to MAX_PB_SIZE << 1.
        dst             .req x0
        dststride       .req x7
        src             .req x1
        srcstride       .req x2
        height          .req x3
        heightw         .req w3
        mx              .req x4
        width           .req w6
.endif
|
|
|
|
.ifc \type, qpel_uni
        // Register aliases for the qpel_uni variant (8 bit output with an
        // explicit destination stride); all arguments arrive in registers.
        // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                          uint8_t *_src, ptrdiff_t _srcstride,
        //                          int height, intptr_t mx, intptr_t my, int width)
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x4
        heightw         .req w4
        mx              .req x5
        width           .req w7
.endif
|
|
|
|
.ifc \type, qpel_bi
        // Register aliases for the qpel_bi variant (8 bit output, combined
        // with a second int16_t source).
        // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                         uint8_t *_src, ptrdiff_t _srcstride,
        //                         int16_t *src2, int height, intptr_t mx,
        //                         intptr_t my, int width)
        // src2 has no alias; the functions below use x4 directly.
        // width is the ninth argument, so it is not in a register on entry:
        // the functions that need it load it from the stack into w8.
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x5
        heightw         .req w5
        mx              .req x6
        width           .req w8
.endif
|
|
|
|
|
|
|
|
.ifc \type, qpel
// Internal helper (not exported), emitted once, for the qpel expansion only.
// Applies the horizontal 8-tap filter to two rows, four outputs per row.
// In:   v16.8b:v17.8b  16 source bytes of the first row
//       v18.8b:v19.8b  16 source bytes of the second row
//       v0.8h          signed filter taps (see load_filter)
// Out:  v23.4h         four filtered values of the first row
//       v24.4h         four filtered values of the second row
// Clobbers: v16-v21 (inputs are widened to 16 bit in place).
function ff_hevc_put_hevc_h4_8_neon, export=0
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b

        // tap 0 initialises the accumulators
        mul             v23.4h, v16.4h, v0.h[0]
        mul             v24.4h, v18.4h, v0.h[0]

        // taps 1..7: shift the 16 bit samples left by i elements, accumulate
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v23.4h, v20.4h, v0.h[\i]
        mla             v24.4h, v21.4h, v0.h[\i]
.endr
        ret
endfunc
.endif
|
|
|
|
|
|
|
|
// Horizontal 8-tap filter, 4 pixels wide, two rows per loop iteration.
// Arguments follow the register aliases set up at the top of this macro.
// Register use inside the function:
//   mx        holds the saved return address once the filter is loaded,
//             because bl to the shared helper overwrites x30
//   x13       2 * srcstride (two rows per iteration)
//   x14       destination advance per iteration (two rows)
//   x10, x12  destination and source pointers of the second row
//   x15, x16  qpel_bi only: second row of src2 and its two-row byte stride
// The loop subtracts 2 from height each pass and repeats while it is > 0.
function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3            // filter window starts 3 pixels to the left
        mov             mx, x30                 // keep the return address across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h4_8_neon
        subs            heightw, heightw, #2    // flags are consumed by b.gt below

.ifc \type, qpel
        // 16 bit intermediate output, stored unrounded
        st1             {v23.4h}, [dst], x14
        st1             {v24.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add the second source, then round, shift by 7 and saturate to u8
        sqadd           v23.4h, v23.4h, v25.4h
        sqadd           v24.4h, v24.4h, v26.4h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift by 6 and saturate to u8
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.s}[0], [dst], x14
        st1             {v24.s}[0], [x10], x14
.endif
        b.gt            0b // double line
        ret             mx
endfunc
|
|
|
|
|
|
|
|
.ifc \type, qpel
// Internal helper (not exported), emitted once, for the qpel expansion only.
// Applies the horizontal 8-tap filter to two rows, eight outputs per row.
// In:   v16.8b:v17.8b  16 source bytes of the first row
//       v18.8b:v19.8b  16 source bytes of the second row
//       v0.8h          signed filter taps (see load_filter)
// Out:  v23.8h         eight filtered values of the first row
//       v24.8h         eight filtered values of the second row
// Clobbers: v16-v21 (inputs are widened to 16 bit in place).
function ff_hevc_put_hevc_h8_8_neon, export=0
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b

        // tap 0 initialises the accumulators
        mul             v23.8h, v16.8h, v0.h[0]
        mul             v24.8h, v18.8h, v0.h[0]

        // taps 1..7: shift the 16 bit samples left by i elements, accumulate
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v23.8h, v20.8h, v0.h[\i]
        mla             v24.8h, v21.8h, v0.h[\i]
.endr
        ret
endfunc
.endif
|
|
|
|
|
|
|
|
// Horizontal 8-tap filter, 6 pixels wide, two rows per loop iteration.
// Uses the 8-wide helper and stores only the first six results of each row,
// as a 4-element store followed by a 2-element store.
// Register use inside the function:
//   mx        holds the saved return address once the filter is loaded,
//             because bl to the shared helper overwrites x30
//   x13       2 * srcstride (two rows per iteration)
//   x14       two-row destination stride minus the bytes already advanced
//             by the first (post-incrementing) store of each row
//   x10, x12  destination and source pointers of the second row
//   x15, x16  qpel_bi only: second row of src2 and its two-row byte stride
function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3            // filter window starts 3 pixels to the left
        mov             mx, x30                 // keep the return address across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 8)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
        sub             x14, x14, #4
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2    // flags are consumed by b.gt below

.ifc \type, qpel
        // 16 bit output: elements 0..3, then elements 4..5 as one 32 bit lane
        st1             {v23.4h}, [dst], #8
        st1             {v24.4h}, [x10], #8
        st1             {v23.s}[2], [dst], x14
        st1             {v24.s}[2], [x10], x14
.else
.ifc \type, qpel_bi
        // add the second source, then round, shift by 7 and saturate to u8
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift by 6 and saturate to u8
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        // 8 bit output: bytes 0..3, then bytes 4..5 as one 16 bit lane
        st1             {v23.s}[0], [dst], #4
        st1             {v24.s}[0], [x10], #4
        st1             {v23.h}[2], [dst], x14
        st1             {v24.h}[2], [x10], x14
.endif
        b.gt            0b // double line
        ret             mx
endfunc
|
|
|
|
|
|
|
|
// Horizontal 8-tap filter, 8 pixels wide, two rows per loop iteration.
// Register use inside the function:
//   mx        holds the saved return address once the filter is loaded,
//             because bl to the shared helper overwrites x30
//   x13       2 * srcstride (two rows per iteration)
//   x14       destination advance per iteration (two rows)
//   x10, x12  destination and source pointers of the second row
//   x15, x16  qpel_bi only: second row of src2 and its two-row byte stride
// The loop subtracts 2 from height each pass and repeats while it is > 0.
function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3            // filter window starts 3 pixels to the left
        mov             mx, x30                 // keep the return address across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2    // flags are consumed by b.gt below

.ifc \type, qpel
        // 16 bit intermediate output, stored unrounded
        st1             {v23.8h}, [dst], x14
        st1             {v24.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add the second source, then round, shift by 7 and saturate to u8
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift by 6 and saturate to u8
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.8b}, [dst], x14
        st1             {v24.8b}, [x10], x14
.endif
        b.gt            0b // double line
        ret             mx
endfunc
|
|
|
|
|
|
|
|
.ifc \type, qpel
// Internal helper (not exported), emitted once, for the qpel expansion only.
// Applies the horizontal 8-tap filter to two rows, sixteen outputs per row.
// In:   v16.8h         first 8 samples of row one, ALREADY widened to 16 bit
//                      by the caller (only v17 and v18 are widened here)
//       v17.8b, v18.8b next 16 source bytes of row one
//       v19.8h         first 8 samples of row two, already widened
//       v20.8b, v21.8b next 16 source bytes of row two
//       v0.8h          signed filter taps (see load_filter)
// Out:  v26.8h, v27.8h outputs 0..7 and 8..15 of row one
//       v28.8h, v29.8h outputs 0..7 and 8..15 of row two
// Clobbers: v17, v18, v20-v25.
function ff_hevc_put_hevc_h16_8_neon, export=0
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b

        uxtl            v20.8h, v20.8b
        uxtl            v21.8h, v21.8b

        // tap 0 initialises the accumulators
        mul             v26.8h, v16.8h, v0.h[0]
        mul             v27.8h, v17.8h, v0.h[0]
        mul             v28.8h, v19.8h, v0.h[0]
        mul             v29.8h, v20.8h, v0.h[0]
        // taps 1..7: shift the 16 bit samples left by i elements, accumulate
.irpc i, 1234567
        ext             v22.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v17.16b, v18.16b, #(2*\i)

        ext             v24.16b, v19.16b, v20.16b, #(2*\i)
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)

        mla             v26.8h, v22.8h, v0.h[\i]
        mla             v27.8h, v23.8h, v0.h[\i]

        mla             v28.8h, v24.8h, v0.h[\i]
        mla             v29.8h, v25.8h, v0.h[\i]
.endr
        ret
endfunc
.endif
|
|
|
|
|
|
|
|
// Horizontal 8-tap filter for widths that are a multiple of 12.
// Works on one 12 pixel wide column slice at a time: the inner loop (label 1)
// walks the whole height two rows per pass, then the outer loop (label 0)
// rewinds all pointers to the top and steps 12 pixels to the right.
// This slice-by-slice order is kept on purpose for this function; the wider
// variants were changed to iterate horizontally first.
// The 16-wide helper does the filtering; only 12 results per row are stored.
// Register use inside the function:
//   mx        holds the saved return address once the filter is loaded,
//             because bl to the shared helper overwrites x30
//   x9        row counter of the inner loop (reloaded from height)
//   x13       2 * srcstride (two rows per iteration)
//   x14       two-row destination stride minus the bytes already advanced
//             by the first (post-incrementing) store of each row
//   x10, x12  destination and source pointers of the second row
//   x15, x16  qpel_bi only: second row of src2 and its two-row byte stride
//   x17       qpel_bi only: bytes src2 advanced over one slice (rewind amount)
function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw         // height is an int; widen for msub
.ifc \type, qpel_bi
        ldrh            w8, [sp] // width
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        lsl             x17, height, #7 // src2b reset (height * (MAX_PB_SIZE << 1))
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3            // filter window starts 3 pixels to the left
        mov             mx, x30                 // keep the return address across bl
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
        sub             x14, x14, #8
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      mov             x9, height
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        // the helper expects the first 8 samples of each row already widened
        uxtl            v16.8h, v16.8b
        uxtl            v19.8h, v19.8b
        bl              ff_hevc_put_hevc_h16_8_neon
        subs            x9, x9, #2              // flags are consumed by b.gt 1b

.ifc \type, qpel
        // 16 bit output: elements 0..7, then elements 8..11
        st1             {v26.8h}, [dst], #16
        st1             {v28.8h}, [x10], #16
        st1             {v27.4h}, [dst], x14
        st1             {v29.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add the second source, then round, shift by 7 and saturate to u8
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        // round, shift by 6 and saturate to u8
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        // 8 bit output: bytes 0..7, then bytes 8..11
        st1             {v26.8b}, [dst], #8
        st1             {v28.8b}, [x10], #8
        st1             {v27.s}[0], [dst], x14
        st1             {v29.s}[0], [x10], x14
.endif
        b.gt            1b // double line
        // The flags set here are tested by the final b.gt 0b; none of the
        // msub/sub/add instructions in between modify them.
        subs            width, width, #12
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc
        sub             x4, x4, x17
        sub             x15, x15, x17
        add             x4, x4, #24             // 12 int16_t samples
        add             x15, x15, #24
.endif
        add             src, src, #12
        add             x12, x12, #12
.ifc \type, qpel
        add             dst, dst, #24           // 12 int16_t outputs
        add             x10, x10, #24
.else
        add             dst, dst, #12           // 12 uint8_t outputs
        add             x10, x10, #12
.endif
        b.gt            0b
        ret             mx
endfunc
|
|
|
|
|
|
|
|
// Horizontal 8-tap filter, exactly 16 pixels wide, two rows per iteration.
// There is a single loop over the height and no outer loop over the width:
// the 16-wide case is kept as a dedicated vertical loop, while the wider
// variants iterate horizontally first.
// Register use inside the function:
//   mx        holds the saved return address once the filter is loaded,
//             because bl to the shared helper overwrites x30
//   x13       2 * srcstride (two rows per iteration)
//   x14       destination advance per iteration (two rows)
//   x10, x12  destination and source pointers of the second row
//   x15, x16  qpel_bi only: second row of src2 and its two-row byte stride
function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw
        mov             mx, x30                 // keep the return address across bl
.ifc \type, qpel_bi
        // NOTE(review): width is loaded here but nothing in this function
        // reads it afterwards; looks like a leftover from the removed outer
        // loop - confirm before dropping the load.
        ldrh            w8, [sp] // width
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3            // filter window starts 3 pixels to the left
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb

1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        // the helper expects the first 8 samples of each row already widened
        uxtl            v16.8h, v16.8b
        uxtl            v19.8h, v19.8b
        bl              ff_hevc_put_hevc_h16_8_neon
        subs            height, height, #2      // flags are consumed by b.gt below

.ifc \type, qpel
        // 16 bit intermediate output, stored unrounded
        st1             {v26.8h, v27.8h}, [dst], x14
        st1             {v28.8h, v29.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add the second source, then round, shift by 7 and saturate to u8
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        // round, shift by 6 and saturate to u8
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b, v27.8b}, [dst], x14
        st1             {v28.8b, v29.8b}, [x10], x14
.endif
        b.gt            1b // double line
        ret             mx
endfunc
|
|
|
|
|
|
|
|
// Horizontal 8-tap filter for block widths of 32 pixels and more.
//
// Each pair of rows is filtered left to right in 16-pixel steps before the
// code moves down two rows.  Compared with walking 16-pixel-wide vertical
// slices this reads the source in a more cache friendly order, and the
// extra pixels the 8-tap filter needs past each step are carried over in
// registers instead of being reloaded for every slice.
//
// The 16-pixel kernel is ff_hevc_put_hevc_h16_8_neon (defined elsewhere in
// this file).  Judging by the loads and stores around the call it takes row
// a in v16-v18 and row b in v19-v21 and returns results in v26/v27 (row a)
// and v28/v29 (row b) -- confirm against the helper.
function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
        load_filter     mx                              // v0.8h = sign-extended taps selected by mx
        sxtw            height, heightw
        mov             mx, x30                         // mx is free now: keep the return address across bl
.ifc \type, qpel_bi
        ldrh            w8, [sp]                        // width
        mov             x16, #(MAX_PB_SIZE << 2)        // src2bstridel
        lsl             x17, x5, #7                     // src2b reset (not read again in this function)
        add             x15, x4, #(MAX_PB_SIZE << 1)    // src2b
        sub             x16, x16, width, uxtw #1        // two src2 rows minus the bytes consumed per row pair
.endif
        sub             src, src, #3                    // start three pixels left of the first output
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1              // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
        sub             x14, x14, width, uxtw #1        // 16-bit output: 2 * width bytes written per row
.else
        lsl             x14, dststride, #1              // dststridel
        lsl             x13, srcstride, #1              // srcstridel
        sub             x14, x14, width, uxtw           // 8-bit output: width bytes written per row
.endif
        sub             x13, x13, width, uxtw
        sub             x13, x13, #8                    // each row reads width + 8 source bytes
        add             x10, dst, dststride             // dstb
        add             x12, src, srcstride             // srcb
0:      mov             w9, width                       // columns left in this row pair
        ld1             {v16.8b}, [src], #8
        ld1             {v19.8b}, [x12], #8
        uxtl            v16.8h, v16.8b
        uxtl            v19.8h, v19.8b
1:
        ld1             {v17.8b-v18.8b}, [src], #16
        ld1             {v20.8b-v21.8b}, [x12], #16

        bl              ff_hevc_put_hevc_h16_8_neon
        subs            w9, w9, #16                     // flags are consumed by b.gt 1b below

        mov             v16.16b, v18.16b                // carry the trailing pixels into the next step
        mov             v19.16b, v21.16b
.ifc \type, qpel
        st1             {v26.8h, v27.8h}, [dst], #32
        st1             {v28.8h, v29.8h}, [x10], #32
.else
.ifc \type, qpel_bi
        ld1             {v20.8h, v21.8h}, [ x4], #32
        ld1             {v22.8h, v23.8h}, [x15], #32
        sqadd           v26.8h, v26.8h, v20.8h
        sqadd           v27.8h, v27.8h, v21.8h
        sqadd           v28.8h, v28.8h, v22.8h
        sqadd           v29.8h, v29.8h, v23.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b, v27.8b}, [dst], #16
        st1             {v28.8b, v29.8b}, [x10], #16
.endif
        b.gt            1b                              // double line
        subs            height, height, #2
        add             src, src, x13
        add             x12, x12, x13
        add             dst, dst, x14
        add             x10, x10, x14
.ifc \type, qpel_bi
        add             x4, x4, x16
        add             x15, x15, x16
.endif
        b.gt            0b
        ret             mx
endfunc
|
|
|
|
|
|
|
|
.unreq height
|
|
|
|
.unreq heightw
|
|
|
|
.unreq width
|
|
|
|
.unreq src
|
|
|
|
.unreq dst
|
|
|
|
.unreq srcstride
|
|
|
|
.unreq dststride
|
|
|
|
.unreq mx
|
|
|
|
.endm
|
|
|
|
|
|
|
|
// Expand the put_hevc macro once per output flavour (its "type" argument).
put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 4 pixels wide, 16-bit output.
// Register use as visible below:
//   x0 = dst (4 halfwords per row, row pitch MAX_PB_SIZE * 2 bytes)
//   x1 = src, x2 = src stride, w3 = rows left
//   x5 = filter index handed to load_qpel_filterb, x4 = its scratch register
// calc_qpelb and calc_all are macros defined elsewhere in this file; calc_all
// is expected to expand calc repeatedly while rotating v16-v23 and to use the
// labels 1 and 2 for looping and exit.
function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)          // dst row pitch in bytes
        sub             x1, x1, x2                      // src is now three rows above the block
        ldr             s16, [x1]                       // preload seven rows into v16-v22
        ldr             s17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             s18, [x1]
        ldr             s19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             s20, [x1]
        ldr             s21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             s22, [x1]
        add             x1, x1, x2
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().s}[0], [x1], x2        // newest row
        movi            v24.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.4h}, [x0], x9
        subs            w3, w3, #1
        b.eq            2f                              // all rows done
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 6 pixels wide, 16-bit output.
// Register use as visible below:
//   x0 = dst (row pitch MAX_PB_SIZE * 2 bytes), x1 = src, x2 = src stride
//   w3 = rows left, x5 = filter index, x4 = scratch for load_qpel_filterb
// Eight source bytes are loaded per row although only six results are stored.
function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2 - 8)      // row pitch minus the 8 bytes of the first store
        sub             x1, x1, x2                      // src is now three rows above the block
        ldr             d16, [x1]                       // preload seven rows into v16-v22
        ldr             d17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d18, [x1]
        ldr             d19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d20, [x1]
        ldr             d21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d22, [x1]
        add             x1, x1, x2
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x1], x2          // newest row
        movi            v24.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.4h}, [x0], #8              // results 0-3
        st1             {v24.s}[2], [x0], x9            // results 4-5, then step to the next row
        subs            w3, w3, #1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 8 pixels wide, 16-bit output.
// Register use as visible below:
//   x0 = dst (row pitch MAX_PB_SIZE * 2 bytes), x1 = src, x2 = src stride
//   w3 = rows left, x5 = filter index, x4 = scratch for load_qpel_filterb
function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)          // dst row pitch in bytes
        sub             x1, x1, x2                      // src is now three rows above the block
        ldr             d16, [x1]                       // preload seven rows into v16-v22
        ldr             d17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d18, [x1]
        ldr             d19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d20, [x1]
        ldr             d21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d22, [x1]
        add             x1, x1, x2
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x1], x2          // newest row
        movi            v24.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.8h}, [x0], x9
        subs            w3, w3, #1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 12 pixels wide, 16-bit output.
// Register use as visible below:
//   x0 = dst (row pitch MAX_PB_SIZE * 2 bytes), x1 = src, x2 = src stride
//   w3 = rows left, x5 = filter index, x4 = scratch for load_qpel_filterb
// Sixteen source bytes are loaded per row; calc_qpelb produces results 0-7
// and calc_qpelb2 the upper half, of which only four are stored.
function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2 - 16)     // row pitch minus the 16 bytes of the first store
        sub             x1, x1, x2                      // src is now three rows above the block
        ldr             q16, [x1]                       // preload seven rows into v16-v22
        ldr             q17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q18, [x1]
        ldr             q19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q20, [x1]
        ldr             q21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q22, [x1]
        add             x1, x1, x2
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x1], x2         // newest row
        movi            v24.8h, #0
        movi            v25.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.8h}, [x0], #16             // results 0-7
        subs            w3, w3, #1
        st1             {v25.4h}, [x0], x9              // results 8-11, then step to the next row
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 16 pixels wide, 16-bit output.
// Register use as visible below:
//   x0 = dst (row pitch MAX_PB_SIZE * 2 bytes), x1 = src, x2 = src stride
//   w3 = rows left, x5 = filter index, x4 = scratch for load_qpel_filterb
function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)          // dst row pitch in bytes
        sub             x1, x1, x2                      // src is now three rows above the block
        ldr             q16, [x1]                       // preload seven rows into v16-v22
        ldr             q17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q18, [x1]
        ldr             q19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q20, [x1]
        ldr             q21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q22, [x1]
        add             x1, x1, x2
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x1], x2         // newest row
        movi            v24.8h, #0
        movi            v25.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        subs            w3, w3, #1
        st1             {v24.8h, v25.8h}, [x0], x9
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 24 pixels wide, 16-bit output.
// Register use as visible below:
//   x0 = dst (row pitch MAX_PB_SIZE * 2 bytes), x1 = src, x2 = src stride
//   w3 = rows left, x5 = filter index, x4 = scratch for load_qpel_filterb
//   x9, x10 = scratch
// Every source row is fetched as 16 + 8 bytes, so exactly the 24 bytes that
// are filtered get read (a single two-register 16b load would read 32).
// Only calc_qpelb is applied to the second register of each row, and the
// 8-wide variant shows that calc_qpelb works on 8-byte rows, so the upper
// half of that register is never needed.
function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
        sub             sp, sp, #32
        st1             {v8.8b, v9.8b, v10.8b}, [sp]    // v8-v10 are callee-saved (low 64 bits)
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2                      // src is now three rows above the block
        sub             x10, x2, #16                    // stride remaining after the 16-byte post-increment
        mov             x9, #(MAX_PB_SIZE * 2)          // dst row pitch in bytes
        ld1             {v16.16b}, [x1], #16            // preload seven rows into v16-v29
        ld1             {v17.8b}, [x1], x10
        ld1             {v18.16b}, [x1], #16
        ld1             {v19.8b}, [x1], x10
        ld1             {v20.16b}, [x1], #16
        ld1             {v21.8b}, [x1], x10
        ld1             {v22.16b}, [x1], #16
        ld1             {v23.8b}, [x1], x10
        ld1             {v24.16b}, [x1], #16
        ld1             {v25.8b}, [x1], x10
        ld1             {v26.16b}, [x1], #16
        ld1             {v27.8b}, [x1], x10
        ld1             {v28.16b}, [x1], #16
        ld1             {v29.8b}, [x1], x10
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().16b}, [x1], #16       // newest row, columns 0-15
        ld1             {\tmp1\().8b}, [x1], x10        // newest row, columns 16-23
        movi            v8.8h, #0
        movi            v9.8h, #0
        movi            v10.8h, #0
        calc_qpelb      v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        subs            w3, w3, #1
        st1             {v8.8h, v9.8h, v10.8h}, [x0], x9
.endm
1:      calc_all2
.purgem calc
2:      ld1             {v8.8b, v9.8b, v10.8b}, [sp]
        add             sp, sp, #32
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 32 pixels wide, 16-bit output.
// Register use as visible below:
//   x0 = dst (row pitch MAX_PB_SIZE * 2 bytes), x1 = src, x2 = src stride
//   w3 = rows left, x5 = filter index, x4 = scratch for load_qpel_filterb
// v8-v11 hold the four result vectors, so their callee-saved low halves are
// spilled to the stack for the duration of the function.
function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
        sub             sp, sp, #32
        st1             {v8.8b-v11.8b}, [sp]
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)          // dst row pitch in bytes
        sub             x1, x1, x2                      // src is now three rows above the block
        ld1             {v16.16b, v17.16b}, [x1], x2    // preload seven rows into v16-v29
        ld1             {v18.16b, v19.16b}, [x1], x2
        ld1             {v20.16b, v21.16b}, [x1], x2
        ld1             {v22.16b, v23.16b}, [x1], x2
        ld1             {v24.16b, v25.16b}, [x1], x2
        ld1             {v26.16b, v27.16b}, [x1], x2
        ld1             {v28.16b, v29.16b}, [x1], x2
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().16b, \tmp1\().16b}, [x1], x2  // newest row
        movi            v8.8h, #0
        movi            v9.8h, #0
        movi            v10.8h, #0
        movi            v11.8h, #0
        calc_qpelb      v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        subs            w3, w3, #1
        st1             {v8.8h-v11.8h}, [x0], x9
.endm
1:      calc_all2
.purgem calc
2:      ld1             {v8.8b-v11.8b}, [sp], #32      // restore v8-v11 and release the frame
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical qpel filter, 48 pixels wide: two calls of the 24-wide function.
// Stack frame (48 bytes): [0] x2,x3  [16] x0,x1  [32] x5  [40] x30.
function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
        stp             x2, x3, [sp, #-48]!
        stp             x0, x1, [sp, #16]
        stp             x5, x30, [sp, #32]
        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
        ldr             x5, [sp, #32]
        ldp             x0, x1, [sp, #16]
        ldp             x2, x3, [sp], #32               // sp now points at the saved x5/x30 pair
        add             x0, x0, #48                     // 24 outputs of 2 bytes each
        add             x1, x1, #24                     // 24 source bytes
        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, processed in 32-pixel-wide column slices.
// Register use as visible below:
//   x0 = dst (row pitch MAX_PB_SIZE * 2 bytes), x1 = src, x2 = src stride
//   w3 = rows per slice, x5 = filter index, x4 = scratch
//   w6 = columns left, decremented by 32 per slice (presumably the width)
//   x8 / x10 / w11 = per-slice copies of src / dst / row count
function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
        sub             sp, sp, #32
        st1             {v8.8b-v11.8b}, [sp]            // v8-v11 are callee-saved (low 64 bits)
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2                      // src is now three rows above the block
        mov             x9, #(MAX_PB_SIZE * 2)          // dst row pitch in bytes
0:      mov             x8, x1                          // src
        ld1             {v16.16b, v17.16b}, [x8], x2
        mov             w11, w3                         // height
        ld1             {v18.16b, v19.16b}, [x8], x2
        mov             x10, x0                         // dst
        ld1             {v20.16b, v21.16b}, [x8], x2
        ld1             {v22.16b, v23.16b}, [x8], x2
        ld1             {v24.16b, v25.16b}, [x8], x2
        ld1             {v26.16b, v27.16b}, [x8], x2
        ld1             {v28.16b, v29.16b}, [x8], x2
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().16b, \tmp1\().16b}, [x8], x2  // newest row
        movi            v8.8h, #0
        movi            v9.8h, #0
        movi            v10.8h, #0
        movi            v11.8h, #0
        calc_qpelb      v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        subs            x11, x11, #1
        st1             {v8.8h-v11.8h}, [x10], x9
.endm
1:      calc_all2
.purgem calc
2:      add             x0, x0, #64                     // next slice: 32 outputs of 2 bytes each
        add             x1, x1, #32
        subs            w6, w6, #32
        b.hi            0b
        ld1             {v8.8b-v11.8b}, [sp], #32      // restore v8-v11 and release the frame
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter with bi-prediction, 4 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   x4 = src2 (16-bit rows, pitch MAX_PB_SIZE * 2 bytes), w5 = rows left
//   x7 = filter index, x6 = scratch for load_qpel_filterb
// The filtered row is added to the src2 row with saturation and narrowed
// with a rounding right shift by 7.
function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
        load_qpel_filterb x7, x6
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        mov             x12, #(MAX_PB_SIZE * 2)         // src2 row pitch in bytes
        ld1             {v16.s}[0], [x2], x3            // preload seven rows into v16-v22
        ld1             {v17.s}[0], [x2], x3
        ld1             {v18.s}[0], [x2], x3
        ld1             {v19.s}[0], [x2], x3
        ld1             {v20.s}[0], [x2], x3
        ld1             {v21.s}[0], [x2], x3
        ld1             {v22.s}[0], [x2], x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().s}[0], [x2], x3        // newest row
        movi            v24.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v25.4h}, [x4], x12             // src2
        sqadd           v24.8h, v24.8h, v25.8h
        sqrshrun        v25.8b, v24.8h, #7
        subs            w5, w5, #1
        st1             {v25.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter with bi-prediction, 6 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   x4 = src2 (16-bit rows, pitch MAX_PB_SIZE * 2 bytes), w5 = rows left
//   x7 = filter index, x6 = scratch for load_qpel_filterb
function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
        load_qpel_filterb x7, x6
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        ld1             {v16.8b}, [x2], x3              // preload seven rows into v16-v22
        sub             x1, x1, #4                      // dst stride minus the 4 bytes of the first store
        ld1             {v17.8b}, [x2], x3
        mov             x12, #(MAX_PB_SIZE * 2)         // src2 row pitch in bytes
        ld1             {v18.8b}, [x2], x3
        ld1             {v19.8b}, [x2], x3
        ld1             {v20.8b}, [x2], x3
        ld1             {v21.8b}, [x2], x3
        ld1             {v22.8b}, [x2], x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3          // newest row
        movi            v24.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v25.8h}, [x4], x12             // src2
        sqadd           v24.8h, v24.8h, v25.8h
        sqrshrun        v25.8b, v24.8h, #7
        st1             {v25.s}[0], [x0], #4            // pixels 0-3
        subs            w5, w5, #1
        st1             {v25.h}[2], [x0], x1            // pixels 4-5, then step to the next row
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter with bi-prediction, 8 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   x4 = src2 (16-bit rows, pitch MAX_PB_SIZE * 2 bytes), w5 = rows left
//   x7 = filter index, x6 = scratch for load_qpel_filterb
function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
        load_qpel_filterb x7, x6
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        mov             x12, #(MAX_PB_SIZE * 2)         // src2 row pitch in bytes
        ld1             {v16.8b}, [x2], x3              // preload seven rows into v16-v22
        ld1             {v17.8b}, [x2], x3
        ld1             {v18.8b}, [x2], x3
        ld1             {v19.8b}, [x2], x3
        ld1             {v20.8b}, [x2], x3
        ld1             {v21.8b}, [x2], x3
        ld1             {v22.8b}, [x2], x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3          // newest row
        movi            v24.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v25.8h}, [x4], x12             // src2
        sqadd           v24.8h, v24.8h, v25.8h
        sqrshrun        v25.8b, v24.8h, #7
        subs            w5, w5, #1
        st1             {v25.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter with bi-prediction, 12 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   x4 = src2 (16-bit rows, pitch MAX_PB_SIZE * 2 bytes), w5 = rows left
//   x7 = filter index, x6 = scratch for load_qpel_filterb
// Sixteen source bytes and sixteen src2 halfwords are loaded per row; twelve
// pixels are stored.
function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
        load_qpel_filterb x7, x6
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        sub             x1, x1, #8                      // dst stride minus the 8 bytes of the first store
        ld1             {v16.16b}, [x2], x3             // preload seven rows into v16-v22
        mov             x12, #(MAX_PB_SIZE * 2)         // src2 row pitch in bytes
        ld1             {v17.16b}, [x2], x3
        ld1             {v18.16b}, [x2], x3
        ld1             {v19.16b}, [x2], x3
        ld1             {v20.16b}, [x2], x3
        ld1             {v21.16b}, [x2], x3
        ld1             {v22.16b}, [x2], x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x2], x3         // newest row
        movi            v24.8h, #0
        movi            v25.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v26.8h, v27.8h}, [x4], x12     // src2
        sqadd           v24.8h, v24.8h, v26.8h
        sqadd           v25.8h, v25.8h, v27.8h
        sqrshrun        v26.8b, v24.8h, #7
        sqrshrun2       v26.16b, v25.8h, #7
        st1             {v26.8b}, [x0], #8              // pixels 0-7
        subs            w5, w5, #1
        st1             {v26.s}[2], [x0], x1            // pixels 8-11, then step to the next row
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter with bi-prediction, 16 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   x4 = src2 (16-bit rows, pitch MAX_PB_SIZE * 2 bytes), w5 = rows left
//   x7 = filter index, x6 = scratch for load_qpel_filterb
function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
        load_qpel_filterb x7, x6
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        mov             x12, #(MAX_PB_SIZE * 2)         // src2 row pitch in bytes
        ld1             {v16.16b}, [x2], x3             // preload seven rows into v16-v22
        ld1             {v17.16b}, [x2], x3
        ld1             {v18.16b}, [x2], x3
        ld1             {v19.16b}, [x2], x3
        ld1             {v20.16b}, [x2], x3
        ld1             {v21.16b}, [x2], x3
        ld1             {v22.16b}, [x2], x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x2], x3         // newest row
        movi            v24.8h, #0
        movi            v25.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v26.8h, v27.8h}, [x4], x12     // src2
        sqadd           v24.8h, v24.8h, v26.8h
        sqadd           v25.8h, v25.8h, v27.8h
        sqrshrun        v26.8b, v24.8h, #7
        subs            w5, w5, #1
        sqrshrun2       v26.16b, v25.8h, #7
        st1             {v26.16b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical bi-prediction qpel filter, 24 pixels wide: a 16-wide call for the
// left part followed by an 8-wide call for the remaining columns.
// Stack frame (64 bytes): [0] x4,x5  [16] x2,x3  [32] x0,x1  [48] x7  [56] x30.
function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        stp             x7, x30, [sp, #48]
        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x7, [sp, #48]
        ldp             x4, x5, [sp], #48               // sp now points at the saved x7/x30 pair
        add             x0, x0, #16                     // dst: 16 pixels to the right
        add             x2, x2, #16                     // src: 16 pixels to the right
        add             x4, x4, #32                     // src2: 16 halfwords to the right
        bl              X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter with bi-prediction, processed in 32-pixel-wide
// column slices, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride
//   x4 = src2 (16-bit rows, pitch MAX_PB_SIZE * 2 bytes), w5 = rows per slice
//   x7 = filter index, x6 = scratch for load_qpel_filterb
//   w6 = columns left, read from the first stack slot of the caller (the
//        word at the entry value of sp); the loop exits only when it reaches
//        exactly zero, so it must be a multiple of 32
//   x8 / x9 / x10 / w11 = per-slice copies of src / src2 / dst / row count
function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
        stp             d8, d9, [sp, #-64]!             // d8-d15 are callee-saved
        stp             d10, d11, [sp, #16]
        stp             d12, d13, [sp, #32]
        stp             d14, d15, [sp, #48]
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        load_qpel_filterb x7, x6
        ldr             w6, [sp, #64]                   // stack argument, just above the 64-byte frame
        mov             x12, #(MAX_PB_SIZE * 2)         // src2 row pitch in bytes
0:      mov             x8, x2                          // src
        ld1             {v16.16b, v17.16b}, [x8], x3
        mov             w11, w5                         // height
        ld1             {v18.16b, v19.16b}, [x8], x3
        mov             x10, x0                         // dst
        ld1             {v20.16b, v21.16b}, [x8], x3
        mov             x9, x4                          // src2
        ld1             {v22.16b, v23.16b}, [x8], x3
        ld1             {v24.16b, v25.16b}, [x8], x3
        ld1             {v26.16b, v27.16b}, [x8], x3
        ld1             {v28.16b, v29.16b}, [x8], x3
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x3    // newest row
        movi            v8.8h, #0
        movi            v9.8h, #0
        movi            v10.8h, #0
        movi            v11.8h, #0
        calc_qpelb      v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        ld1             {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12     // src2
        sqadd           v8.8h, v8.8h, v12.8h
        sqadd           v9.8h, v9.8h, v13.8h
        sqadd           v10.8h, v10.8h, v14.8h
        sqadd           v11.8h, v11.8h, v15.8h
        sqrshrun        v12.8b, v8.8h, #7
        sqrshrun2       v12.16b, v9.8h, #7
        sqrshrun        v13.8b, v10.8h, #7
        sqrshrun2       v13.16b, v11.8h, #7
        subs            x11, x11, #1
        st1             {v12.16b, v13.16b}, [x10], x1
.endm
1:      calc_all2
.purgem calc
2:      add             x0, x0, #32                     // dst
        add             x2, x2, #32                     // src
        add             x4, x4, #64                     // src2
        subs            w6, w6, #32
        b.ne            0b
        ldp             d10, d11, [sp, #16]
        ldp             d12, d13, [sp, #32]
        ldp             d14, d15, [sp, #48]
        ldp             d8, d9, [sp], #64
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical bi-prediction qpel filter, 48 pixels wide: a 32-wide call followed
// by a 16-wide call for the remaining columns.
// The 32-wide function reads its column count from the word at its entry sp,
// so the value 32 is placed at the bottom of the frame before the call.
// Stack frame (80 bytes): [0] 32  [16] x4,x5  [32] x2,x3  [48] x0,x1
//                         [64] x7  [72] x30.
function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
        mov             x8, #32
        str             x8, [sp, #-80]!                 // fake width argument for the callee
        stp             x4, x5, [sp, #16]
        stp             x2, x3, [sp, #32]
        stp             x0, x1, [sp, #48]
        stp             x7, x30, [sp, #64]
        bl              X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
        ldp             x4, x5, [sp, #16]
        ldp             x2, x3, [sp, #32]
        ldp             x0, x1, [sp, #48]
        ldr             x7, [sp, #64]
        add             sp, sp, #64                     // sp now points at the saved x7/x30 pair
        add             x0, x0, #32                     // dst: 32 pixels to the right
        add             x2, x2, #32                     // src: 32 pixels to the right
        add             x4, x4, #64                     // src2: 32 halfwords to the right
        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc
|
|
|
|
|
|
|
|
// 64-wide variant: tail-branches to the 32-wide function, which loops over
// 32-pixel column slices using the column count found on the stack.
function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 4-byte-wide block, two rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
1:
        ldr             s0, [x2]
        ldr             s1, [x2, x3]
        subs            w4, w4, #2
        add             x2, x2, x3, lsl #1
        str             s0, [x0]
        str             s1, [x0, x1]
        add             x0, x0, x1, lsl #1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 6-byte-wide block, two rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
// Eight bytes are loaded per row; six are stored as 4 + 2.
function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
        sub             x1, x1, #4                      // dst stride minus the 4 bytes of the first store
1:
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        subs            w4, w4, #2
        add             x2, x2, x3, lsl #1
        str             s0, [x0], #4
        st1             {v0.h}[2], [x0], x1
        str             s1, [x0], #4
        st1             {v1.h}[2], [x0], x1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of an 8-byte-wide block, two rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
1:
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        subs            w4, w4, #2
        add             x2, x2, x3, lsl #1
        str             d0, [x0]
        str             d1, [x0, x1]
        add             x0, x0, x1, lsl #1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 12-byte-wide block, two rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
// Sixteen bytes are loaded per row; twelve are stored as 8 + 4.
function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
        sub             x1, x1, #8                      // dst stride minus the 8 bytes of the first store
1:
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        subs            w4, w4, #2
        add             x2, x2, x3, lsl #1
        str             d0, [x0], #8
        st1             {v0.s}[2], [x0], x1
        str             d1, [x0], #8
        st1             {v1.s}[2], [x0], x1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 16-byte-wide block, two rows per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
1:
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        subs            w4, w4, #2
        add             x2, x2, x3, lsl #1
        str             q0, [x0]
        str             q1, [x0, x1]
        add             x0, x0, x1, lsl #1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 24-byte-wide block, one row per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
1:
        ld1             {v0.8b, v1.8b, v2.8b}, [x2], x3
        subs            w4, w4, #1
        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 32-byte-wide block, one row per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
1:
        ld1             {v0.16b, v1.16b}, [x2], x3
        subs            w4, w4, #1
        st1             {v0.16b, v1.16b}, [x0], x1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 48-byte-wide block, one row per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
1:
        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
        subs            w4, w4, #1
        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Plain copy of a 64-byte-wide block, one row per iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
1:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        subs            w4, w4, #1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        b.hi            1b                              // loop while rows remain
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 4 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
//   x6 = filter index, x5 = scratch for load_qpel_filterb
// The filtered row is narrowed with a rounding right shift by 6.
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        ldr             s16, [x2]                       // preload seven rows into v16-v22
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s18, [x2]
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s20, [x2]
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s22, [x2]
        add             x2, x2, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().s}[0], [x2], x3        // newest row
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        subs            w4, w4, #1
        st1             {v24.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 6 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
//   x6 = filter index, x5 = scratch for load_qpel_filterb
// Eight source bytes are loaded per row; six pixels are stored as 4 + 2.
function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x1, x1, #4                      // dst stride minus the 4 bytes of the first store
        sub             x2, x2, x3                      // src is now three rows above the block
        ldr             d16, [x2]                       // preload seven rows into v16-v22
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
        add             x2, x2, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3          // newest row
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        st1             {v24.s}[0], [x0], #4            // pixels 0-3
        subs            w4, w4, #1
        st1             {v24.h}[2], [x0], x1            // pixels 4-5, then step to the next row
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter, 8 pixels wide, 8-bit output.
// Register use as visible below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = rows left
//   x6 = filter index, x5 = scratch for load_qpel_filterb
function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3                      // src is now three rows above the block
        ldr             d16, [x2]                       // preload seven rows into v16-v22
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
        add             x2, x2, x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3          // newest row
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        subs            w4, w4, #1
        st1             {v24.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter processed in 12-pixel-wide columns.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
//   x6 = filter index for load_qpel_filterb, w7 = remaining width
//   (reduced by 12 per column; the loop ends when it reaches zero).
// Per column: x8 = src cursor, x10 = dst cursor, x11 = row counter.
function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
        sub             x1, x1, #8                      // first store of each row already advances the dst cursor by 8
        load_qpel_filterb x6, x5
        sub             x2, x2, x3                      // step src back three rows in total
        sub             x2, x2, x3, lsl #1
0:      mov             x10, x0         // dst
        mov             x8, x2          // src
        mov             w11, w4         // height
        // Prime the window with seven 16-byte rows (only 12 results are stored).
        ld1             {v16.16b}, [x8], x3
        ld1             {v17.16b}, [x8], x3
        ld1             {v18.16b}, [x8], x3
        ld1             {v19.16b}, [x8], x3
        ld1             {v20.16b}, [x8], x3
        ld1             {v21.16b}, [x8], x3
        ld1             {v22.16b}, [x8], x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x8], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        sqrshrun2       v24.16b, v25.8h, #6
        subs            x11, x11, #1                    // flags are left for the code following the macro
        st1             {v24.8b}, [x10], #8             // pixels 0-7
        st1             {v24.s}[2], [x10], x1           // pixels 8-11
.endm
1:      calc_all
.purgem calc
2:      add             x2, x2, #12                     // next 12-pixel column
        add             x0, x0, #12
        subs            w7, w7, #12
        b.ne            0b
        ret
endfunc
|
|
|
|
|
|
|
|
// Vertical 8-tap qpel filter processed in 16-pixel-wide columns.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
//   x6 = filter index for load_qpel_filterb, w7 = remaining width
//   (reduced by 16 per column; the loop ends when it reaches zero).
// Per column: x8 = src cursor, x10 = dst cursor, x11 = row counter.
function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
        load_qpel_filterb x6, x5
        sub             x2, x2, x3                      // step src back three rows in total
        sub             x2, x2, x3, lsl #1
0:      mov             x10, x0         // dst
        mov             w11, w4         // height
        mov             x8, x2          // src
        // Prime the window with seven rows (v16-v22).
        ld1             {v16.16b}, [x8], x3
        ld1             {v17.16b}, [x8], x3
        ld1             {v18.16b}, [x8], x3
        ld1             {v19.16b}, [x8], x3
        ld1             {v20.16b}, [x8], x3
        ld1             {v21.16b}, [x8], x3
        ld1             {v22.16b}, [x8], x3
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x8], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        sqrshrun        v24.8b, v24.8h, #6
        sqrshrun2       v24.16b, v25.8h, #6
        st1             {v24.16b}, [x10], x1
        subs            x11, x11, #1                    // flags are left for the code following the macro
.endm
1:      calc_all
.purgem calc
2:      add             x2, x2, #16                     // next 16-pixel column
        add             x0, x0, #16
        subs            w7, w7, #16
        b.ne            0b
        ret
endfunc
|
|
|
|
|
|
|
|
// 24-wide entry point: tail-branches to the 12-wide routine, which walks
// 12-pixel columns until its w7 counter reaches zero. All argument
// registers are passed through untouched.
function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
endfunc
|
|
|
|
|
|
|
|
// 32-wide entry point: tail-branches to the 16-wide routine, which walks
// 16-pixel columns until its w7 counter reaches zero. All argument
// registers are passed through untouched.
function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
|
|
|
|
|
|
|
|
// 48-wide entry point: tail-branches to the 16-wide routine, which walks
// 16-pixel columns until its w7 counter reaches zero. All argument
// registers are passed through untouched.
function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
|
|
|
|
|
|
|
|
// 64-wide entry point: tail-branches to the 16-wide routine, which walks
// 16-pixel columns until its w7 counter reaches zero. All argument
// registers are passed through untouched.
function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 4 pixels wide, two rows per iteration.
// Per pixel: ((src << 6) * w6) rounding-shifted right by (6 + w5), plus w7,
// saturated down to an unsigned byte.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = row counter (reduced by 2 per iteration, loop ends at zero).
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ldr             s0, [x2]
        ldr             s1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // first row
        ushll           v0.8h, v0.8b, #6
        smull           v0.4s, v0.4h, v30.4h
        sqrshl          v0.4s, v0.4s, v31.4s
        sqadd           v0.4s, v0.4s, v29.4s
        sqxtn           v0.4h, v0.4s
        sqxtun          v0.8b, v0.8h
        // second row
        ushll           v1.8h, v1.8b, #6
        smull           v1.4s, v1.4h, v30.4h
        sqrshl          v1.4s, v1.4s, v31.4s
        sqadd           v1.4s, v1.4s, v29.4s
        sqxtn           v1.4h, v1.4s
        sqxtun          v1.8b, v1.8h
        subs            w4, w4, #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 6 pixels wide, two rows per iteration.
// Per pixel: ((src << 6) * w6) rounding-shifted right by (6 + w5), plus w7,
// saturated down to an unsigned byte. Eight pixels are computed per row,
// six are stored (4 + 2 bytes).
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = row counter (reduced by 2 per iteration, loop ends at zero).
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
        sub             x1, x1, #4                      // first store of each row already advances x0 by 4
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #2                      // nothing below touches the flags before b.ne
        // first row: v0 -> v4/v5 -> v0
        ushll           v0.8h, v0.8b, #6
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtun          v0.8b, v0.8h
        // second row: v1 -> v6/v7 -> v1
        ushll           v1.8h, v1.8b, #6
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v1.8b, v1.8h
        str             s0, [x0], #4
        st1             {v0.h}[2], [x0], x1
        str             s1, [x0], #4
        st1             {v1.h}[2], [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 8 pixels wide, two rows per iteration.
// Per pixel: ((src << 6) * w6) rounding-shifted right by (6 + w5), plus w7,
// saturated down to an unsigned byte.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = row counter (reduced by 2 per iteration, loop ends at zero).
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w4, w4, #2                      // nothing below touches the flags before b.ne
        // first row: v0 -> v4/v5 -> v0
        ushll           v0.8h, v0.8b, #6
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtun          v0.8b, v0.8h
        // second row: v1 -> v6/v7 -> v1
        ushll           v1.8h, v1.8b, #6
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v1.8b, v1.8h
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 12 pixels wide, two rows per iteration.
// Per pixel: ((src << 6) * w6) rounding-shifted right by (6 + w5), plus w7,
// saturated down to an unsigned byte. Sixteen pixels are computed per row,
// twelve are stored (8 + 4 bytes).
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = row counter (reduced by 2 per iteration, loop ends at zero).
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
        sub             x1, x1, #8                      // first store of each row already advances x0 by 8
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        subs            w4, w4, #2                      // nothing below touches the flags before b.ne
        // Widen both rows up front: v1 is reused as an output register of
        // the first row further down, so it must be consumed before that.
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        ushll2          v7.8h, v1.16b, #6
        // first row: v4/v5 -> v16-v19 -> v0
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        // second row: v6/v7 -> v20-v23 -> v2
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        smull           v22.4s, v7.4h, v30.4h
        smull2          v23.4s, v7.8h, v30.8h
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqrshl          v22.4s, v22.4s, v31.4s
        sqrshl          v23.4s, v23.4s, v31.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        sqadd           v22.4s, v22.4s, v29.4s
        sqadd           v23.4s, v23.4s, v29.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtn           v3.4h, v22.4s
        sqxtn2          v3.8h, v23.4s
        sqxtun          v2.8b, v2.8h
        sqxtun2         v2.16b, v3.8h
        st1             {v0.8b}, [x0], #8               // row 0, pixels 0-7
        st1             {v0.s}[2], [x0], x1             // row 0, pixels 8-11
        st1             {v2.8b}, [x0], #8               // row 1, pixels 0-7
        st1             {v2.s}[2], [x0], x1             // row 1, pixels 8-11
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weight 16 pixels in place.
//   s0      : in = 16 source bytes, out = 16 weighted bytes
//   t0, t1  : 8h temporaries for the low / high eight pixels
//   d0..d3  : 4s temporaries
// Expects v30 = multiplier (8h), v31 = shift count for sqrshl (4s),
// v29 = value added after the shift (4s). All macro arguments must be
// distinct registers.
.macro PEL_UNI_W_PIXEL_CALC s0, t0, t1, d0, d1, d2, d3
        // widen both halves before anything is overwritten
        ushll           \t0\().8h, \s0\().8b, #6
        ushll2          \t1\().8h, \s0\().16b, #6
        // low eight pixels
        smull           \d0\().4s, \t0\().4h, v30.4h
        smull2          \d1\().4s, \t0\().8h, v30.8h
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqrshl          \d1\().4s, \d1\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
        sqadd           \d1\().4s, \d1\().4s, v29.4s
        sqxtn           \t0\().4h, \d0\().4s
        sqxtn2          \t0\().8h, \d1\().4s
        // high eight pixels
        smull           \d2\().4s, \t1\().4h, v30.4h
        smull2          \d3\().4s, \t1\().8h, v30.8h
        sqrshl          \d2\().4s, \d2\().4s, v31.4s
        sqrshl          \d3\().4s, \d3\().4s, v31.4s
        sqadd           \d2\().4s, \d2\().4s, v29.4s
        sqadd           \d3\().4s, \d3\().4s, v29.4s
        sqxtn           \t1\().4h, \d2\().4s
        sqxtn2          \t1\().8h, \d3\().4s
        // saturate to unsigned bytes, back into the source register
        sqxtun          \s0\().8b, \t0\().8h
        sqxtun2         \s0\().16b, \t1\().8h
.endm
|
|
|
|
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 16 pixels wide, two rows per iteration.
// The per-pixel arithmetic lives in PEL_UNI_W_PIXEL_CALC.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//   w4 = row counter (reduced by 2 per iteration, loop ends at zero).
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        subs            w4, w4, #2                      // the macro below leaves the flags alone
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 24 pixels wide, one row per iteration.
// Per pixel: ((src << 6) * w6) rounding-shifted right by (6 + w5), plus w7,
// saturated down to an unsigned byte. 32 bytes are loaded per row, the
// first 24 are processed and stored.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row counter.
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ld1             {v0.16b, v1.16b}, [x2], x3
        subs            w4, w4, #1                      // nothing below touches the flags before b.ne
        // Widen all three 8-pixel groups first: v0/v1 double as outputs.
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        // pixels 0-7: v4 -> v16/v17 -> v0
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtun          v0.8b, v0.8h
        // pixels 8-15: v5 -> v18/v19 -> v1
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtun          v1.8b, v1.8h
        // pixels 16-23: v6 -> v20/v21 -> v2
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtun          v2.8b, v2.8h
        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 32 pixels wide, one row per iteration.
// The per-pixel arithmetic lives in PEL_UNI_W_PIXEL_CALC.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row counter.
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ld1             {v0.16b, v1.16b}, [x2], x3
        subs            w4, w4, #1                      // the macro below leaves the flags alone
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 48 pixels wide, one row per iteration.
// The per-pixel arithmetic lives in PEL_UNI_W_PIXEL_CALC.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row counter.
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
        subs            w4, w4, #1                      // the macro below leaves the flags alone
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19   // temporaries of the first call are free again
        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted full-pel copy, 64 pixels wide, one row per iteration.
// The per-pixel arithmetic lives in PEL_UNI_W_PIXEL_CALC.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row counter.
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
        dup             v30.8h, w6                      // multiplier
        dup             v29.4s, w7                      // value added after the shift
        mov             w10, #-6
        sub             w10, w10, w5                    // -(6 + w5): negative count makes sqrshl shift right
        dup             v31.4s, w10
1:
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        subs            w4, w4, #1                      // the macro below leaves the flags alone
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19   // temporaries are reused pairwise
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Common setup for the weighted vertical qpel routines.
//   v29 = w7 (ox), v30 = w6 (wx), v31 = -(6 + w5) (shift count for sqrshl)
//   v0..v7 = the eight filter magnitudes of row `my` of qpel_filters_abs,
//            each broadcast over 16 bytes (v28 keeps the raw 8 bytes)
//   x2 is moved back by three source rows.
// Clobbers x9, x10, x12.
.macro QPEL_UNI_W_V_HEADER
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v29.4s, w7              // ox
        dup             v30.8h, w6              // wx
        dup             v31.4s, w10             // shift
        ldur            x12, [sp, #8]           // my
        movrel          x9, qpel_filters_abs
        add             x9, x9, x12, lsl #3     // 8 bytes per filter row
        ldr             d28, [x9]
        sub             x2, x2, x3
        sub             x2, x2, x3, lsl #1
        dup             v0.16b, v28.b[0]
        dup             v1.16b, v28.b[1]
        dup             v2.16b, v28.b[2]
        dup             v3.16b, v28.b[3]
        dup             v4.16b, v28.b[4]
        dup             v5.16b, v28.b[5]
        dup             v6.16b, v28.b[6]
        dup             v7.16b, v28.b[7]
.endm
|
|
|
|
|
|
|
|
// 8-tap filter over the low eight bytes of src0..src7, 16-bit result in dst.
// v0..v7 hold the tap magnitudes; taps 1, 3, 4 and 6 are added and taps
// 0, 2, 5 and 7 are subtracted. The accumulation wraps modulo 2^16, so the
// order of the multiply-accumulates does not affect the result.
// dst must not be one of the source registers.
.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
        // added taps
        umull           \dst\().8h, \src1\().8b, v1.8b
        umlal           \dst\().8h, \src3\().8b, v3.8b
        umlal           \dst\().8h, \src4\().8b, v4.8b
        umlal           \dst\().8h, \src6\().8b, v6.8b
        // subtracted taps
        umlsl           \dst\().8h, \src0\().8b, v0.8b
        umlsl           \dst\().8h, \src2\().8b, v2.8b
        umlsl           \dst\().8h, \src5\().8b, v5.8b
        umlsl           \dst\().8h, \src7\().8b, v7.8b
.endm
|
|
|
|
|
|
|
|
// 8-tap filter over the high eight bytes of src0..src7, 16-bit result in dst.
// v0..v7 hold the tap magnitudes; taps 1, 3, 4 and 6 are added and taps
// 0, 2, 5 and 7 are subtracted. The accumulation wraps modulo 2^16, so the
// order of the multiply-accumulates does not affect the result.
// dst must not be one of the source registers.
.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        // added taps
        umull2          \dst\().8h, \src1\().16b, v1.16b
        umlal2          \dst\().8h, \src3\().16b, v3.16b
        umlal2          \dst\().8h, \src4\().16b, v4.16b
        umlal2          \dst\().8h, \src6\().16b, v6.16b
        // subtracted taps
        umlsl2          \dst\().8h, \src0\().16b, v0.16b
        umlsl2          \dst\().8h, \src2\().16b, v2.16b
        umlsl2          \dst\().8h, \src5\().16b, v5.16b
        umlsl2          \dst\().8h, \src7\().16b, v7.16b
.endm
|
|
|
|
|
|
|
|
// Weight the four filtered 16-bit values in v24 and store them as 4 bytes
// at [x0], then advance x0 by x1.
// Uses v30 (multiplier), v31 (sqrshl shift count) and v29 (added value).
.macro QPEL_UNI_W_V_4
        smull           v24.4s, v24.4h, v30.4h          // widen and multiply
        sqrshl          v24.4s, v24.4s, v31.4s          // rounding shift
        sqadd           v24.4s, v24.4s, v29.4s          // add offset
        sqxtn           v24.4h, v24.4s                  // saturate to 16 bit ...
        sqxtun          v24.8b, v24.8h                  // ... then to unsigned 8 bit
        st1             {v24.s}[0], [x0], x1
.endm
|
|
|
|
|
|
|
|
// Weighted vertical 8-tap qpel filter, 4 pixels wide.
// Setup (filter taps, weights, src rewind) is done by QPEL_UNI_W_V_HEADER.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row counter.
// The loop is unrolled eight times so that the eight-row window v16-v23
// rotates through the registers without any moves.
function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // Prime v16-v22 with seven rows; x2 is left on the seventh row.
        ldr             s16, [x2]
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s18, [x2]
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s20, [x2]
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s22, [x2]

        // Each step loads one new row and emits one output row. Neither
        // macro modifies the condition flags, so the counter is updated
        // right after the load and tested after the store.
1:      ldr             s23, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_4
        b.eq            2f

        ldr             s16, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_4
        b.eq            2f

        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_4
        b.eq            2f

        ldr             s18, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_4
        b.eq            2f

        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_4
        b.eq            2f

        ldr             s20, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_4
        b.eq            2f

        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_4
        b.eq            2f

        ldr             s22, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v24, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_4
        b.ne            1b
2:
        ret
endfunc
|
|
|
|
|
|
|
|
// Weight the eight filtered 16-bit values in v26 and store them as 8 bytes
// at [x0], then advance x0 by x1. Clobbers v24 and v25.
// Uses v30 (multiplier), v31 (sqrshl shift count) and v29 (added value).
.macro QPEL_UNI_W_V_8
        smull           v24.4s, v26.4h, v30.4h          // pixels 0-3
        smull2          v25.4s, v26.8h, v30.8h          // pixels 4-7
        sqrshl          v24.4s, v24.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.8b}, [x0], x1
.endm
|
|
|
|
|
|
|
|
// Weighted vertical 8-tap qpel filter, 8 pixels wide.
// Setup (filter taps, weights, src rewind) is done by QPEL_UNI_W_V_HEADER.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row counter.
// The loop is unrolled eight times so that the eight-row window v16-v23
// rotates through the registers without any moves.
function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // Prime v16-v22 with seven rows; x2 is left on the seventh row.
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]

        // Each step loads one new row and emits one output row. Neither
        // macro modifies the condition flags, so the counter is updated
        // right after the load and tested after the store.
1:      ldr             d23, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_8
        b.eq            2f

        ldr             d16, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_8
        b.eq            2f

        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_8
        b.eq            2f

        ldr             d18, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_8
        b.eq            2f

        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_8
        b.eq            2f

        ldr             d20, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_8
        b.eq            2f

        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_8
        b.eq            2f

        ldr             d22, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_8
        b.ne            1b
2:
        ret
endfunc
|
|
|
|
|
|
|
|
// Weight the sixteen filtered 16-bit values in v26 (pixels 0-7) and v27
// (pixels 8-15) and store them as 16 bytes at [x0], then advance x0 by x1.
// Clobbers v24-v27.
// Uses v30 (multiplier), v31 (sqrshl shift count) and v29 (added value).
.macro QPEL_UNI_W_V_16
        // The widening multiplies overwrite their own inputs (v26, v27),
        // so these four must stay in exactly this order.
        smull           v24.4s, v26.4h, v30.4h
        smull2          v25.4s, v26.8h, v30.8h
        smull           v26.4s, v27.4h, v30.4h
        smull2          v27.4s, v27.8h, v30.8h
        sqrshl          v24.4s, v24.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqrshl          v26.4s, v26.4s, v31.4s
        sqadd           v26.4s, v26.4s, v29.4s
        sqrshl          v27.4s, v27.4s, v31.4s
        sqadd           v27.4s, v27.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtn           v26.4h, v26.4s
        sqxtn2          v26.8h, v27.4s
        sqxtun          v24.8b, v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v24.16b}, [x0], x1
.endm
|
|
|
|
|
|
|
|
// Weighted vertical 8-tap qpel filter, 16 pixels wide.
// Setup (filter taps, weights, src rewind) is done by QPEL_UNI_W_V_HEADER.
// Register use in this routine:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row counter.
// The loop is unrolled eight times so that the eight-row window v16-v23
// rotates through the registers without any moves.
function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // Prime v16-v22 with seven rows; x2 is left on the seventh row.
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]

        // Each step loads one new row and emits one output row. None of
        // the macros modifies the condition flags, so the counter is
        // updated right after the load and tested after the store.
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q16, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q18, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q20, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q22, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        b.ne            1b
2:
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted vertical 8-tap qpel filter, processed as 16-pixel-wide columns.
// Setup (filter taps, weights, src rewind) is done by QPEL_UNI_W_V_HEADER.
// Register use in this routine:
//   x0 = dst cursor, x1 = dst stride, x2 = src cursor, x3 = src stride,
//   w4 = row counter for the current column,
//   w13 = column counter loaded from [sp, #16], reduced by 16 per column,
//   x14 / x15 = dst / src start of the current column, w11 = saved height.
function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        ldur            w13, [sp, #16]
        mov             w11, w4
        mov             x14, x0
        mov             x15, x2

3:
        // Prime v16-v22 with seven rows; x2 is left on the seventh row.
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]

        // Eight unrolled steps; each loads one new row and emits one output
        // row. None of the macros modifies the condition flags.
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q16, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q18, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q20, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        b.eq            2f

        ldr             q22, [x2]
        subs            w4, w4, #1
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        b.ne            1b
2:
        // Move on to the next 16-pixel column and restore the row counter.
        add             x14, x14, #16
        add             x15, x15, #16
        mov             x0, x14
        mov             x2, x15
        mov             w4, w11
        subs            w13, w13, #16
        b.hi            3b
        ret
endfunc
|
|
|
|
|
|
|
|
#if HAVE_I8MM
|
|
|
|
ENABLE_I8MM
|
|
|
|
|
|
|
|
// Horizontal + vertical qpel filter, 4 pixels wide.
// On entry: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//           w4 = height, x5 = value forwarded to the horizontal pass,
//           x6 = filter index for load_qpel_filterh.
// A scratch array of (height + 7) rows of MAX_PB_SIZE * 2 bytes is carved
// out of the stack and handed to ff_hevc_put_hevc_qpel_h4_8_neon_i8mm
// together with src - 3 * stride and a row count of height + 7. The
// vertical pass then reads the scratch rows back using sp itself as the
// cursor, so the stack is fully released once every row has been consumed
// (seven priming rows here plus the rows loaded by `calc`, presumably one
// per output row inside calc_all).
function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10                     // tmp_array
        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1              // src - 3 * stride
        sub             x1, x1, x3
        add             x0, sp, #48                     // scratch starts above the 48-byte save area
        mov             x2, x3
        // 32-bit add, as in the other hv routines: height is a 32-bit
        // quantity and the upper half of x4 must not leak into the row
        // count passed to the horizontal pass.
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48
        mov             x9, #(MAX_PB_SIZE * 2)          // scratch row pitch in bytes
        load_qpel_filterh x6, x5
        // Prime v16-v22 with the first seven scratch rows.
        ldr             d16, [sp]
        ldr             d17, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x9
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        sqxtun          v1.8b, v1.8h
        subs            w4, w4, #1                      // flags are left for the code following the macro
        st1             {v1.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal + vertical qpel filter, 6 pixels wide.
// On entry: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//           w4 = height, x5 = value forwarded to the horizontal pass,
//           x6 = filter index for load_qpel_filterh.
// A scratch array of (height + 7) rows of MAX_PB_SIZE * 2 bytes is carved
// out of the stack and handed to ff_hevc_put_hevc_qpel_h6_8_neon_i8mm. The
// vertical pass reads it back with sp as the cursor, which releases the
// scratch area as it goes.
function ff_hevc_put_hevc_qpel_uni_hv6_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10                     // tmp_array
        str             x30, [sp, #-48]!
        stp             x0, x1, [sp, #32]
        stp             x4, x6, [sp, #16]
        add             x0, sp, #48                     // scratch starts above the 48-byte save area
        sub             x1, x2, x3, lsl #1              // src - 3 * stride
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7                      // rows for the horizontal pass
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x6, [sp, #16]
        ldr             x30, [sp], #48
        sub             x1, x1, #4                      // first store of each row already advances x0 by 4
        mov             x9, #(MAX_PB_SIZE * 2)          // scratch row pitch in bytes
        load_qpel_filterh x6, x5
        // Prime v16-v22 with the first seven scratch rows.
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        subs            w4, w4, #1                      // flags are left for the code following the macro
        sqxtun          v1.8b, v1.8h
        st1             {v1.s}[0], [x0], #4
        st1             {v1.h}[2], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal + vertical qpel filter, 8 pixels wide.
// On entry: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//           w4 = height, x5 = value forwarded to the horizontal pass,
//           x6 = filter index for load_qpel_filterh.
// A scratch array of (height + 7) rows of MAX_PB_SIZE * 2 bytes is carved
// out of the stack and handed to ff_hevc_put_hevc_qpel_h8_8_neon_i8mm. The
// vertical pass reads it back with sp as the cursor, which releases the
// scratch area as it goes.
function ff_hevc_put_hevc_qpel_uni_hv8_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10                     // tmp_array
        str             x30, [sp, #-48]!
        stp             x0, x1, [sp, #32]
        stp             x4, x6, [sp, #16]
        add             x0, sp, #48                     // scratch starts above the 48-byte save area
        sub             x1, x2, x3, lsl #1              // src - 3 * stride
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7                      // rows for the horizontal pass
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x6, [sp, #16]
        ldr             x30, [sp], #48
        mov             x9, #(MAX_PB_SIZE * 2)          // scratch row pitch in bytes
        load_qpel_filterh x6, x5
        // Prime v16-v22 with the first seven scratch rows.
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        sqxtun          v1.8b, v1.8h
        st1             {v1.8b}, [x0], x1
        subs            w4, w4, #1                      // flags are left for the code following the macro
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal + vertical qpel filter, 12 pixels wide.
// On entry: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//           w4 = height, x5 = value forwarded to the horizontal pass,
//           x6 = filter index for load_qpel_filterh, x7 = preserved across
//           the call.
// A scratch array of (height + 7) rows of MAX_PB_SIZE * 2 bytes is carved
// out of the stack and handed to ff_hevc_put_hevc_qpel_h12_8_neon_i8mm. The
// vertical pass reads it back with sp as the cursor, which releases the
// scratch area as it goes.
function ff_hevc_put_hevc_qpel_uni_hv12_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10                     // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x0, x1, [sp, #32]
        stp             x4, x6, [sp, #16]
        add             x0, sp, #48                     // scratch starts above the 48-byte save area
        sub             x1, x2, x3, lsl #1              // src - 3 * stride
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7                      // rows for the horizontal pass
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x6, [sp, #16]
        ldp             x7, x30, [sp], #48
        sub             x1, x1, #8                      // first store of each row already advances x0 by 8
        mov             x9, #(MAX_PB_SIZE * 2)          // scratch row pitch in bytes
        load_qpel_filterh x6, x5
        // Prime the window with seven scratch rows, two registers per row.
        ld1             {v16.8h, v17.8h}, [sp], x9
        ld1             {v18.8h, v19.8h}, [sp], x9
        ld1             {v20.8h, v21.8h}, [sp], x9
        ld1             {v22.8h, v23.8h}, [sp], x9
        ld1             {v24.8h, v25.8h}, [sp], x9
        ld1             {v26.8h, v27.8h}, [sp], x9
        ld1             {v28.8h, v29.8h}, [sp], x9
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
        subs            w4, w4, #1                      // flags are left for the code following the macro
        sqxtun          v1.8b, v1.8h
        sqxtun2         v1.16b, v2.8h
        st1             {v1.8b}, [x0], #8               // pixels 0-7
        st1             {v1.s}[2], [x0], x1             // pixels 8-11
.endm
1:      calc_all2
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal + vertical qpel filter, processed as 16-pixel-wide columns.
// On entry: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//           w4 = height, x5 = value forwarded to the horizontal pass,
//           x6 = filter index for load_qpel_filterh, w7 = width.
// A scratch array of (height + 7) rows of MAX_PB_SIZE * 2 bytes is carved
// out of the stack and handed to ff_hevc_put_hevc_qpel_h16_8_neon_i8mm.
//
// .Lqpel_uni_hv16_loop is also the join point for the 32/48/64-wide entry
// points: they arrive with the scratch array at sp and x0, x1, w4, x6, w7
// restored. sp advances by 32 bytes per column; the epilogue then skips
// the rest of the first scratch row (x12) and the remaining height + 6
// rows, which frees the whole array.
function ff_hevc_put_hevc_qpel_uni_hv16_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7
        sub             sp, sp, x10                     // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x0, x1, [sp, #32]
        stp             x4, x6, [sp, #16]
        sub             x1, x2, x3, lsl #1              // src - 3 * stride
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7                      // rows for the horizontal pass
        add             x0, sp, #48                     // scratch starts above the 48-byte save area
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x6, [sp, #16]
        ldp             x7, x30, [sp], #48
.Lqpel_uni_hv16_loop:
        mov             x9, #(MAX_PB_SIZE * 2)          // scratch row pitch in bytes
        load_qpel_filterh x6, x5
        sub             w12, w9, w7, lsl #1             // row pitch minus 2 * width
0:      mov             x8, sp          // src
        mov             x10, x0         // dst
        mov             w11, w4         // height
        // Prime the window with seven scratch rows, two registers per row.
        ld1             {v16.8h, v17.8h}, [x8], x9
        ld1             {v18.8h, v19.8h}, [x8], x9
        ld1             {v20.8h, v21.8h}, [x8], x9
        ld1             {v22.8h, v23.8h}, [x8], x9
        ld1             {v24.8h, v25.8h}, [x8], x9
        ld1             {v26.8h, v27.8h}, [x8], x9
        ld1             {v28.8h, v29.8h}, [x8], x9
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
        subs            x11, x11, #1                    // flags are left for the code following the macro
        sqxtun          v1.8b, v1.8h
        sqxtun2         v1.16b, v2.8h
        st1             {v1.16b}, [x10], x1
.endm
1:      calc_all2
.purgem calc
2:      add             sp, sp, #32                     // next 16-pixel column of the scratch array
        add             x0, x0, #16
        subs            w7, w7, #16
        b.ne            0b
        add             w10, w4, #6
        lsl             x10, x10, #7
        add             sp, sp, x12                     // discard rest of first line
        add             sp, sp, x10                     // tmp_array without first line
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal + vertical qpel filter, 24 pixels wide: a 16-wide call
// followed by an 8-wide call on the columns to its right.
// 64-byte frame: [sp] = x4, x5   [sp, #16] = x2, x3
//                [sp, #32] = x0, x1   [sp, #48] = x6, x30
function ff_hevc_put_hevc_qpel_uni_hv24_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x6, x30, [sp, #48]
        stp             x0, x1, [sp, #32]
        stp             x2, x3, [sp, #16]
        mov             x7, #16                         // left part: 16 columns
        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon_i8mm)
        // Reload the arguments and shift dst / src 16 pixels to the right.
        ldp             x0, x1, [sp, #32]
        ldp             x2, x3, [sp, #16]
        ldp             x4, x5, [sp], #48               // sp now points at the saved x6, x30 pair
        ldr             x6, [sp]
        add             x0, x0, #16
        add             x2, x2, #16
        mov             x7, #8                          // right part: 8 columns
        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon_i8mm)
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc
|
|
|
|
|
|
|
|
// 32-column uni HV qpel.
// Reserves (w4 + 7) * 128 bytes on the stack as the intermediate array, has
// ff_hevc_put_hevc_qpel_h32_8_neon_i8mm fill it starting three source rows
// above x2, then branches to .Lqpel_uni_hv16_loop, which is expected to run
// the vertical pass, release the array and return to the caller.
function ff_hevc_put_hevc_qpel_uni_hv32_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7            // (height + 7) rows of 128 bytes
        sub             sp, sp, x10             // tmp_array
        // 48-byte save area below the array: x7/x30 @0, x4/x6 @16, x0/x1 @32
        stp             x7, x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]

        // Arguments for the horizontal pass.
        add             x0, sp, #48             // dst  = tmp_array
        sub             x1, x2, x3
        sub             x1, x1, x3, lsl #1      // src  = x2 - 3 * x3
        mov             x2, x3                  // source stride
        add             w3, w4, #7              // rows = height + 7
        mov             x4, x5                  // filter index for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)

        ldp             x0, x1, [sp, #32]
        ldp             x4, x6, [sp, #16]
        ldp             x7, x30, [sp], #48      // sp = start of tmp_array again
        b               .Lqpel_uni_hv16_loop
endfunc
|
|
|
|
|
|
|
|
// 48-column uni HV qpel.
// Reserves (w4 + 7) * 128 bytes on the stack as the intermediate array, has
// ff_hevc_put_hevc_qpel_h48_8_neon_i8mm fill it starting three source rows
// above x2, then branches to .Lqpel_uni_hv16_loop, which is expected to run
// the vertical pass, release the array and return to the caller.
function ff_hevc_put_hevc_qpel_uni_hv48_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7            // (height + 7) rows of 128 bytes
        sub             sp, sp, x10             // tmp_array
        // 48-byte save area below the array: x7/x30 @0, x4/x6 @16, x0/x1 @32
        stp             x7, x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]

        // Arguments for the horizontal pass.
        add             x0, sp, #48             // dst  = tmp_array
        sub             x1, x2, x3
        sub             x1, x1, x3, lsl #1      // src  = x2 - 3 * x3
        mov             x2, x3                  // source stride
        add             w3, w4, #7              // rows = height + 7
        mov             x4, x5                  // filter index for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon_i8mm)

        ldp             x0, x1, [sp, #32]
        ldp             x4, x6, [sp, #16]
        ldp             x7, x30, [sp], #48      // sp = start of tmp_array again
        b               .Lqpel_uni_hv16_loop
endfunc
|
|
|
|
|
|
|
|
// 64-column uni HV qpel.
// Reserves (w4 + 7) * 128 bytes on the stack as the intermediate array, has
// ff_hevc_put_hevc_qpel_h64_8_neon_i8mm fill it starting three source rows
// above x2, then branches to .Lqpel_uni_hv16_loop, which is expected to run
// the vertical pass, release the array and return to the caller.
function ff_hevc_put_hevc_qpel_uni_hv64_8_neon_i8mm, export=1
        add             w10, w4, #7
        lsl             x10, x10, #7            // (height + 7) rows of 128 bytes
        sub             sp, sp, x10             // tmp_array
        // 48-byte save area below the array: x7/x30 @0, x4/x6 @16, x0/x1 @32
        stp             x7, x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]

        // Arguments for the horizontal pass.
        add             x0, sp, #48             // dst  = tmp_array
        sub             x1, x2, x3
        sub             x1, x1, x3, lsl #1      // src  = x2 - 3 * x3
        mov             x2, x3                  // source stride
        add             w3, w4, #7              // rows = height + 7
        mov             x4, x5                  // filter index for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon_i8mm)

        ldp             x0, x1, [sp, #32]
        ldp             x4, x6, [sp, #16]
        ldp             x7, x30, [sp], #48      // sp = start of tmp_array again
        b               .Lqpel_uni_hv16_loop
endfunc
|
|
|
|
|
|
|
|
// Common prologue of the weighted horizontal (uni_w_h) functions.
// In:   x2 = src, w5/w6/w7 = weighting parameters, [sp] = filter index
// Out:  x2  = src - 3
//       v28 = selected 8-byte row of qpel_filters, replicated in both halves
//       v29 = ox, v30 = wx, v31 = shift (-6 - w5), each in every 32-bit lane
// Clobbers: x9, x10, x12
.macro QPEL_UNI_W_H_HEADER
        movrel          x9, qpel_filters
        ldr             x12, [sp]               // filter index from the first stack argument
        add             x9, x9, x12, lsl #3     // 8 bytes per filter
        ld1r            {v28.2d}, [x9]
        sub             x2, x2, #3              // the filter window starts 3 bytes before src
        mov             w10, #-6
        sub             w10, w10, w5            // negative count: sqrshl then shifts right
        dup             v29.4s, w7              // ox
        dup             v30.4s, w6              // wx
        dup             v31.4s, w10             // shift
.endm
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 4 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// Each output is sat_u8(sqrshl(filter_sum * wx, shift) + ox).
function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v0.16b}, [x2], x3
        subs            w4, w4, #1              // NZCV is untouched until the b.hi below
        // Source windows starting at byte offsets 1..3 (offset 0 is v0 itself).
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        // Two 8-byte windows per vector: {0,1} and {2,3}.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s  // combine the two 4-tap halves: pixels 0..3
        mul             v16.4s, v16.4s, v30.4s
        sqrshl          v16.4s, v16.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqxtn           v16.4h, v16.4s
        sqxtun          v16.8b, v16.8h
        str             s16, [x0]
        add             x0, x0, x1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 6 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// Each row is written as a 4-byte store followed by a 2-byte store.
function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x1, x1, #4              // the 4-byte store post-increments x0 by 4
1:
        ld1             {v0.16b}, [x2], x3
        subs            w4, w4, #1              // NZCV is untouched until the b.hi below
        // Source windows starting at byte offsets 1..5.
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        // Two 8-byte windows per vector: {0,1}, {2,3}, {4,5}.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        usdot           v18.4s, v4.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s  // pixels 0..3
        addp            v18.4s, v18.4s, v18.4s  // pixels 4, 5 (duplicated in the upper lanes)
        mul             v16.4s, v16.4s, v30.4s
        mul             v18.2s, v18.2s, v30.2s
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v18.2s, v18.2s, v31.2s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v18.2s, v18.2s, v29.2s
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v18.4s
        sqxtun          v16.8b, v16.8h
        str             s16, [x0], #4           // pixels 0..3
        st1             {v16.h}[2], [x0], x1    // pixels 4..5, then step to the next row
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
// Filter and weight four source vectors.
//   s0-s3: byte vectors multiplied lane-wise (4 bytes per 32-bit lane) with the
//          taps in v28
//   d0-d3: accumulators; on exit d0 = weighted addp(d0, d1) and
//          d2 = weighted addp(d2, d3), while d1/d3 hold the raw dot products
// Weighting applied to d0 and d2: sqadd(sqrshl(x * v30, v31), v29).
// Expects v28/v29/v30/v31 as set up by QPEL_UNI_W_H_HEADER.
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
        // usdot accumulates, so clear the destinations first
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        movi            \d2\().16b, #0
        movi            \d3\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v28.16b
        usdot           \d1\().4s, \s1\().16b, v28.16b
        usdot           \d2\().4s, \s2\().16b, v28.16b
        usdot           \d3\().4s, \s3\().16b, v28.16b
        // adjacent lanes hold the two 4-tap halves of one 8-tap sum
        addp            \d0\().4s, \d0\().4s, \d1\().4s
        addp            \d2\().4s, \d2\().4s, \d3\().4s
        mul             \d0\().4s, \d0\().4s, v30.4s
        mul             \d2\().4s, \d2\().4s, v30.4s
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqrshl          \d2\().4s, \d2\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
        sqadd           \d2\().4s, \d2\().4s, v29.4s
.endm
|
|
|
|
|
|
|
|
// Two-vector variant of QPEL_UNI_W_H_CALC.
//   s0, s1: byte vectors multiplied lane-wise with the taps in v28
//   d0, d1: accumulators; on exit d0 = weighted addp(d0, d1), d1 = raw dot
//           products of s1
// Weighting applied to d0: sqadd(sqrshl(x * v30, v31), v29).
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
        // usdot accumulates, so clear the destinations first
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v28.16b
        usdot           \d1\().4s, \s1\().16b, v28.16b
        // adjacent lanes hold the two 4-tap halves of one 8-tap sum
        addp            \d0\().4s, \d0\().4s, \d1\().4s
        mul             \d0\().4s, \d0\().4s, v30.4s
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
.endm
|
|
|
|
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 8 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
//
// Only the low 8 bytes of each shifted window are used (zip1 below), so the
// last byte touched is source byte 14. A single 16-byte load per row is
// therefore sufficient, matching ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, and
// avoids reading 16 unused bytes beyond it.
function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b}, [x2], x3
        // Source windows starting at byte offsets 1..7; the wrapped-around
        // upper bytes are never consumed.
        ext             v1.16b, v16.16b, v16.16b, #1
        ext             v2.16b, v16.16b, v16.16b, #2
        ext             v3.16b, v16.16b, v16.16b, #3
        ext             v4.16b, v16.16b, v16.16b, #4
        ext             v5.16b, v16.16b, v16.16b, #5
        ext             v6.16b, v16.16b, v16.16b, #6
        ext             v7.16b, v16.16b, v16.16b, #7
        // Two 8-byte windows per vector: {0,1}, {2,3}, {4,5}, {6,7}.
        zip1            v0.2d, v16.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21 // v18: 0-3  v20: 4-7
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v20.4s
        sqxtun          v18.8b, v18.8h
        str             d18, [x0]
        add             x0, x0, x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 12 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// Each row is written as 8 bytes at x0 plus 4 bytes at x13 = x0 + 8.
function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        add             x13, x0, #8             // second store pointer (pixels 8..11)
1:
        ld1             {v16.16b, v17.16b}, [x2], x3
        subs            w4, w4, #1              // NZCV is untouched until the b.hi below
        // Source windows starting at byte offsets 1..7.
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        // Low halves: windows for pixels 0..7.
        zip1            v18.2d, v16.2d, v1.2d
        zip1            v19.2d, v2.2d, v3.2d
        zip1            v20.2d, v4.2d, v5.2d
        zip1            v21.2d, v6.2d, v7.2d
        // High halves of offsets 0..3: windows for pixels 8..11.
        zip2            v22.2d, v16.2d, v1.2d
        zip2            v23.2d, v2.2d, v3.2d
        QPEL_UNI_W_H_CALC v18, v19, v20, v21, v0, v2, v4, v6 // v0: 0-3  v4: 4-7
        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25            // v24: 8-11
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v4.4s
        sqxtn           v1.4h, v24.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h

        str             d0, [x0]
        add             x0, x0, x1
        str             s1, [x13]
        add             x13, x13, x1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 16 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// Full 16-byte windows are used, so every usdot result covers pixels n and
// n + 8; the trn1/trn2 pair puts them back into ascending order.
function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b}, [x2], x3
        subs            w4, w4, #1              // NZCV is untouched until the b.hi below
        // Source windows starting at byte offsets 1..7.
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21 // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25 // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
        sqxtn           v0.4h, v18.4s
        sqxtn2          v0.8h, v22.4s           // v0: 0, 8, 2, 10, 4, 12, 6, 14
        sqxtn           v1.4h, v20.4s
        sqxtn2          v1.8h, v24.4s           // v1: 1, 9, 3, 11, 5, 13, 7, 15
        trn1            v2.8h, v0.8h, v1.8h     // 0-7
        trn2            v3.8h, v0.8h, v1.8h     // 8-15
        sqxtun          v0.8b, v2.8h
        sqxtun2         v0.16b, v3.8h
        st1             {v0.16b}, [x0], x1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 24 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// Pixels 0..15 use full 16-byte windows; pixels 16..23 use the low 8 bytes of
// windows taken from the second source register only.
function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x1, x1, #16             // the first store post-increments x0 by 16
1:
        ld1             {v16.16b, v17.16b}, [x2], x3
        subs            w4, w4, #1              // NZCV is untouched until the b.hi below

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v18.8h, v19.8h
        trn2            v21.8h, v18.8h, v19.8h
        sqxtun          v26.8b, v20.8h
        sqxtun2         v26.16b, v21.8h         // 0-15

        // ---- pixels 16..23 ----
        ext             v1.16b, v17.16b, v17.16b, #1
        ext             v2.16b, v17.16b, v17.16b, #2
        ext             v3.16b, v17.16b, v17.16b, #3
        ext             v4.16b, v17.16b, v17.16b, #4
        ext             v5.16b, v17.16b, v17.16b, #5
        ext             v6.16b, v17.16b, v17.16b, #6
        ext             v7.16b, v17.16b, v17.16b, #7
        zip1            v0.2d, v17.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v20.4s
        sqxtun          v27.8b, v18.8h          // 16-23

        st1             {v26.16b}, [x0], #16
        st1             {v27.8b}, [x0], x1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 32 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// The row is processed as two 16-pixel groups, each using the window pair
// (v16, v17) or (v17, v18).
function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
        subs            w4, w4, #1              // NZCV is untouched until the b.hi below

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v0, v19, v20, v21
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v0.8h, v19.8h
        trn2            v21.8h, v0.8h, v19.8h
        sqxtun          v26.8b, v20.8h
        sqxtun2         v26.16b, v21.8h         // 0-15

        // ---- pixels 16..31 ----
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v0, v19, v20, v21
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v0.8h, v19.8h
        trn2            v21.8h, v0.8h, v19.8h
        sqxtun          v27.8b, v20.8h
        sqxtun2         v27.16b, v21.8h         // 16-31

        st1             {v26.16b, v27.16b}, [x0], x1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 48 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// The row is processed as three 16-pixel groups; v24 and v0 only serve as
// scratch accumulators for QPEL_UNI_W_H_CALC.
function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
        subs            w4, w4, #1              // NZCV is untouched until the b.hi below

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v25.8b, v22.8h
        sqxtun2         v25.16b, v23.8h         // 0-15

        // ---- pixels 16..31 ----
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v26.8b, v22.8h
        sqxtun2         v26.16b, v23.8h         // 16-31

        // ---- pixels 32..47 ----
        ext             v1.16b, v18.16b, v19.16b, #1
        ext             v2.16b, v18.16b, v19.16b, #2
        ext             v3.16b, v18.16b, v19.16b, #3
        ext             v4.16b, v18.16b, v19.16b, #4
        ext             v5.16b, v18.16b, v19.16b, #5
        ext             v6.16b, v18.16b, v19.16b, #6
        ext             v7.16b, v18.16b, v19.16b, #7
        QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v27.8b, v22.8h
        sqxtun2         v27.16b, v23.8h         // 32-47

        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Weighted horizontal qpel, 64 pixels per row.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
// the remaining inputs are consumed by QPEL_UNI_W_H_HEADER.
// The row is processed as four 16-pixel groups and each group's result
// overwrites the source register it no longer needs (v16..v19). v24 and v0
// only serve as scratch accumulators for QPEL_UNI_W_H_CALC.
//
// The last group needs just 7 bytes beyond the first 64 (ext #1..#7), so the
// tail load is 8 bytes wide, matching ff_hevc_put_hevc_qpel_h64_8_neon_i8mm,
// rather than reading 16 bytes past the block.
function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x3, x3, #64             // the first load post-increments x2 by 64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v16.8b, v22.8h
        sqxtun2         v16.16b, v23.8h         // 0-15

        // ---- pixels 16..31 ----
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v17.8b, v22.8h
        sqxtun2         v17.16b, v23.8h         // 16-31

        // ---- pixels 32..47 ----
        ext             v1.16b, v18.16b, v19.16b, #1
        ext             v2.16b, v18.16b, v19.16b, #2
        ext             v3.16b, v18.16b, v19.16b, #3
        ext             v4.16b, v18.16b, v19.16b, #4
        ext             v5.16b, v18.16b, v19.16b, #5
        ext             v6.16b, v18.16b, v19.16b, #6
        ext             v7.16b, v18.16b, v19.16b, #7
        QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        // v0 is free again: fetch the 8 tail bytes and step x2 to the next row.
        ld1             {v0.8b}, [x2], x3
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v18.8b, v22.8h
        sqxtun2         v18.16b, v23.8h         // 32-47

        // ---- pixels 48..63 ----
        ext             v1.16b, v19.16b, v0.16b, #1
        ext             v2.16b, v19.16b, v0.16b, #2
        ext             v3.16b, v19.16b, v0.16b, #3
        ext             v4.16b, v19.16b, v0.16b, #4
        ext             v5.16b, v19.16b, v0.16b, #5
        ext             v6.16b, v19.16b, v0.16b, #6
        ext             v7.16b, v19.16b, v0.16b, #7
        QPEL_UNI_W_H_CALC v19, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v19.8b, v22.8h
        sqxtun2         v19.16b, v23.8h         // 48-63

        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Common prologue of the unweighted horizontal (qpel_h) functions.
// In:   x1 = src, x4 = filter index
// Out:  x1  = src - 3
//       v31 = selected 8-byte row of qpel_filters, replicated in both halves
// Clobbers: x9, x11
.macro QPEL_H_HEADER
        sub             x1, x1, #3              // the filter window starts 3 bytes before src
        movrel          x9, qpel_filters
        add             x9, x9, x4, lsl #3      // 8 bytes per filter
        ldr             x11, [x9]
        dup             v31.2d, x11
.endm
|
|
|
|
|
|
|
|
// Horizontal qpel, 4 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2   // dst row pitch in bytes
1:
        ld1             {v0.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below
        // Source windows starting at byte offsets 1..3.
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        // Two 8-byte windows per vector: {0,1} and {2,3}.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        usdot           v16.4s, v0.16b, v31.16b
        usdot           v17.4s, v2.16b, v31.16b
        addp            v16.4s, v16.4s, v17.4s  // combine the two 4-tap halves: pixels 0..3
        sqxtn           v16.4h, v16.4s
        str             d16, [x0]
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal qpel, 6 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
// Each row is written as 8 bytes at x0 plus 4 bytes at x15 = x0 + 8.
function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
        QPEL_H_HEADER
        add             x15, x0, #8             // second store pointer (pixels 4..5)
        mov             x10, #MAX_PB_SIZE * 2   // dst row pitch in bytes
1:
        ld1             {v0.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below
        // Source windows starting at byte offsets 1..5.
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        // Two 8-byte windows per vector: {0,1}, {2,3}, {4,5}.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        usdot           v16.4s, v0.16b, v31.16b
        usdot           v17.4s, v2.16b, v31.16b
        usdot           v18.4s, v4.16b, v31.16b
        addp            v16.4s, v16.4s, v17.4s  // pixels 0..3
        addp            v18.4s, v18.4s, v18.4s  // pixels 4, 5 (duplicated in the upper lanes)
        sqxtn           v16.4h, v16.4s
        sqxtn           v18.4h, v18.4s
        str             d16, [x0]
        add             x0, x0, x10
        str             s18, [x15]
        add             x15, x15, x10
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal qpel, 8 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2   // dst row pitch in bytes
1:
        ld1             {v0.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below
        // Source windows starting at byte offsets 1..7; only their low 8 bytes
        // are consumed by the zip1 below.
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        ext             v6.16b, v0.16b, v0.16b, #6
        ext             v7.16b, v0.16b, v0.16b, #7
        // Two 8-byte windows per vector: {0,1}, {2,3}, {4,5}, {6,7}.
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        movi            v19.16b, #0
        usdot           v16.4s, v0.16b, v31.16b
        usdot           v17.4s, v2.16b, v31.16b
        usdot           v18.4s, v4.16b, v31.16b
        usdot           v19.4s, v6.16b, v31.16b
        addp            v16.4s, v16.4s, v17.4s  // pixels 0..3
        addp            v18.4s, v18.4s, v19.4s  // pixels 4..7
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v18.4s
        str             q16, [x0]
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Four independent dot products against the taps in v31.
//   s0-s3: byte vectors; every 32-bit lane of the result is the dot product of
//          4 source bytes with the 4 taps at the same position in v31
//   d0-d3: receive the results (cleared first because usdot accumulates)
// The caller still has to add adjacent lanes to obtain full 8-tap sums.
.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        movi            \d2\().16b, #0
        movi            \d3\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v31.16b
        usdot           \d1\().4s, \s1\().16b, v31.16b
        usdot           \d2\().4s, \s2\().16b, v31.16b
        usdot           \d3\().4s, \s3\().16b, v31.16b
.endm
|
|
|
|
|
|
|
|
// Horizontal qpel, 12 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
// Each row is written as 16 bytes at x0 plus 8 bytes at x15 = x0 + 16.
function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
        QPEL_H_HEADER
        add             x15, x0, #16            // second store pointer (pixels 8..11)
        mov             x10, #MAX_PB_SIZE * 2   // dst row pitch in bytes
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below
        // Source windows starting at byte offsets 1..7.
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        // 8-byte windows for pixels 4..7.
        zip1            v18.2d, v4.2d, v5.2d
        zip1            v19.2d, v6.2d, v7.2d
        // Full 16-byte windows at offsets 0..3 yield pixels n and n + 8.
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        addp            v20.4s, v20.4s, v22.4s  // 0, 8, 2, 10
        addp            v21.4s, v21.4s, v23.4s  // 1, 9, 3, 11
        movi            v24.16b, #0
        movi            v25.16b, #0
        usdot           v24.4s, v18.16b, v31.16b
        usdot           v25.4s, v19.16b, v31.16b
        addp            v24.4s, v24.4s, v25.4s  // 4-7
        trn1            v26.4s, v20.4s, v21.4s  // 0-3
        trn2            v27.4s, v20.4s, v21.4s  // 8-11
        sqxtn           v26.4h, v26.4s
        sqxtn           v27.4h, v27.4s
        sqxtn2          v26.8h, v24.4s          // 0-7

        str             q26, [x0]
        add             x0, x0, x10
        str             d27, [x15]
        add             x15, x15, x10
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal qpel, 16 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
// Full 16-byte windows are used, so every dot-product vector covers pixels n
// and n + 8; the addp/trn sequence restores ascending order.
function ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2   // dst row pitch in bytes
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below
        // Source windows starting at byte offsets 1..7.
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7

        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27

        addp            v20.4s, v20.4s, v22.4s  // 0, 8, 2, 10
        addp            v21.4s, v21.4s, v23.4s  // 1, 9, 3, 11
        addp            v24.4s, v24.4s, v26.4s  // 4, 12, 6, 14
        addp            v25.4s, v25.4s, v27.4s  // 5, 13, 7, 15

        trn1            v22.4s, v20.4s, v21.4s  // 0-3
        trn2            v23.4s, v20.4s, v21.4s  // 8-11
        trn1            v26.4s, v24.4s, v25.4s  // 4-7
        trn2            v27.4s, v24.4s, v25.4s  // 12-15

        sqxtn           v18.4h, v22.4s
        sqxtn2          v18.8h, v26.4s          // 0-7
        sqxtn           v19.4h, v23.4s
        sqxtn2          v19.8h, v27.4s          // 8-15

        stp             q18, q19, [x0]
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal qpel, 24 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
// Pixels 0..15 go to x0, pixels 16..23 to x15 = x0 + 32.
function ff_hevc_put_hevc_qpel_h24_8_neon_i8mm, export=1
        QPEL_H_HEADER
        add             x15, x0, #32            // second store pointer (pixels 16..23)
        mov             x10, #MAX_PB_SIZE * 2   // dst row pitch in bytes
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v18.4h, v22.4s
        sqxtn2          v18.8h, v26.4s
        sqxtn           v19.4h, v23.4s
        sqxtn2          v19.8h, v27.4s
        stp             q18, q19, [x0]
        add             x0, x0, x10

        // ---- pixels 16..23: low 8 bytes of windows taken from v17 only ----
        ext             v1.16b, v17.16b, v17.16b, #1
        ext             v2.16b, v17.16b, v17.16b, #2
        ext             v3.16b, v17.16b, v17.16b, #3
        ext             v4.16b, v17.16b, v17.16b, #4
        ext             v5.16b, v17.16b, v17.16b, #5
        ext             v6.16b, v17.16b, v17.16b, #6
        ext             v7.16b, v17.16b, v17.16b, #7
        zip1            v0.2d, v17.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        QPEL_H_CALC     v0, v2, v4, v6, v20, v21, v22, v23
        addp            v20.4s, v20.4s, v21.4s  // 16-19
        addp            v22.4s, v22.4s, v23.4s  // 20-23
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        str             q20, [x15]
        add             x15, x15, x10
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal qpel, 32 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
// Pixels 0..15 go to x0, pixels 16..31 to x15 = x0 + 32.
function ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, export=1
        QPEL_H_HEADER
        add             x15, x0, #32            // second store pointer (pixels 16..31)
        mov             x10, #MAX_PB_SIZE * 2   // dst row pitch in bytes
1:
        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0]
        add             x0, x0, x10

        // ---- pixels 16..31 ----
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x15]
        add             x15, x15, x10
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal qpel, 48 pixels per row, 16-bit intermediate output.
// x0 = dst (rows MAX_PB_SIZE * 2 bytes apart), x1 = src, x2 = src stride,
// w3 = row count, x4 = filter index (consumed by QPEL_H_HEADER).
// The row is written as three 32-byte stores; the first two post-increment
// x0 by 32 each, so x10 only adds the remaining MAX_PB_SIZE * 2 - 64 bytes.
function ff_hevc_put_hevc_qpel_h48_8_neon_i8mm, export=1
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2 - 64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0], #32

        // ---- pixels 16..31 ----
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0], #32

        // ---- pixels 32..47 ----
        ext             v1.16b, v18.16b, v19.16b, #1
        ext             v2.16b, v18.16b, v19.16b, #2
        ext             v3.16b, v18.16b, v19.16b, #3
        ext             v4.16b, v18.16b, v19.16b, #4
        ext             v5.16b, v18.16b, v19.16b, #5
        ext             v6.16b, v18.16b, v19.16b, #6
        ext             v7.16b, v18.16b, v19.16b, #7
        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0]
        add             x0, x0, x10             // step to the start of the next dst row
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
// Horizontal qpel, 64 pixels per row, 16-bit intermediate output.
// x0 = dst, x1 = src, x2 = src stride, w3 = row count, x4 = filter index
// (consumed by QPEL_H_HEADER).
// A row is four 32-byte stores, each post-incrementing x0, i.e. 128 bytes per
// row in total. The source row is read as 64 bytes plus an 8-byte tail that
// supplies the extra bytes needed by the last 16 outputs.
function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
        QPEL_H_HEADER
        sub             x2, x2, #64             // the first load post-increments x1 by 64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
        subs            w3, w3, #1              // NZCV is untouched until the b.ne below

        // ---- pixels 0..15 ----
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0], #32

        // ---- pixels 16..31 ----
        ext             v1.16b, v17.16b, v18.16b, #1
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0], #32

        // ---- pixels 32..47 ----
        ext             v1.16b, v18.16b, v19.16b, #1
        ext             v2.16b, v18.16b, v19.16b, #2
        ext             v3.16b, v18.16b, v19.16b, #3
        ext             v4.16b, v18.16b, v19.16b, #4
        ext             v5.16b, v18.16b, v19.16b, #5
        ext             v6.16b, v18.16b, v19.16b, #6
        ext             v7.16b, v18.16b, v19.16b, #7
        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0], #32

        // ---- pixels 48..63: 8-byte tail load, then step x1 to the next row ----
        ld1             {v28.8b}, [x1], x2
        ext             v1.16b, v19.16b, v28.16b, #1
        ext             v2.16b, v19.16b, v28.16b, #2
        ext             v3.16b, v19.16b, v28.16b, #3
        ext             v4.16b, v19.16b, v28.16b, #4
        ext             v5.16b, v19.16b, v28.16b, #5
        ext             v6.16b, v19.16b, v28.16b, #6
        ext             v7.16b, v19.16b, v28.16b, #7
        QPEL_H_CALC     v19, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
        stp             q20, q21, [x0], #32
        b.ne            1b
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm
//
// Horizontal-then-vertical qpel filter producing 4 halfwords per row.
// Registers as used below:
//   x0  output pointer (rows are written 128 bytes apart)
//   x1  source pointer, x2 source stride
//   w3  number of output rows
//   x5  selector passed to load_qpel_filterh (presumably "my" -- confirm)
//   x4  scratch for load_qpel_filterh once the horizontal pass is done
//
// A temporary array of (rows + 7) * 128 bytes is carved out of the stack,
// filled by the h4 helper, and then consumed by advancing sp one 128-byte
// row at a time, so sp is back at its entry value when the routine returns.
//
// The row stride in x7 is materialised only after the call to the helper:
// x7 is a caller-saved register, so it must not be assumed to survive the
// bl (the hv8/hv12/hv16/hv32 variants already do it this way).
function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
        add             w10, w3, #7
        lsl             x10, x10, #7                // (rows + 7) * 128
        sub             sp, sp, x10                 // tmp_array
        stp             x5, x30, [sp, #-32]!        // small frame below tmp_array
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32                 // helper writes into tmp_array
        sub             x1, x1, x2, lsl #1
        add             x3, x3, #7                  // helper filters rows + 7 lines
        sub             x1, x1, x2                  // x1 -= 3 * x2 in total
        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32          // sp -> first row of tmp_array
        mov             x7, #128                    // row stride, set after the call
        load_qpel_filterh x5, x4
        // Preload the first seven intermediate rows.
        ldr             d16, [sp]
        ldr             d17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x7
        // One output row: fetch the next intermediate row, filter, store.
        // The flags from subs are left for the branches that calc_all
        // (defined elsewhere) is expected to emit towards 1b / 2f.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        subs            w3, w3, #1
        st1             {v1.4h}, [x0], x7
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm
//
// Horizontal-then-vertical qpel filter producing 6 halfwords per row
// (stored as 4 halfwords followed by one 32-bit lane).
// Registers as used below:
//   x0  output pointer; each row advances it by 8 + 120 = 128 bytes
//   x1  source pointer, x2 source stride
//   w3  number of output rows
//   x5  selector passed to load_qpel_filterh (presumably "my" -- confirm)
//   x4  scratch for load_qpel_filterh once the horizontal pass is done
//
// The (rows + 7) * 128 byte temporary array lives on the stack and is
// consumed by stepping sp forward one row at a time, which returns sp to
// its entry value by the time the routine returns.
//
// Both stride constants (x7, x8) are set after the helper call because
// they live in caller-saved registers; this matches the wider variants.
function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
        add             w10, w3, #7
        lsl             x10, x10, #7                // (rows + 7) * 128
        sub             sp, sp, x10                 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32                 // helper writes into tmp_array
        sub             x1, x1, x2, lsl #1
        add             x3, x3, #7
        sub             x1, x1, x2                  // x1 -= 3 * x2 in total
        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32          // sp -> first row of tmp_array
        mov             x7, #128                    // tmp_array row stride
        mov             x8, #120                    // remainder of an output row after 8 bytes
        load_qpel_filterh x5, x4
        // Preload the first seven intermediate rows.
        ldr             q16, [sp]
        ldr             q17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x7
        // One output row; flags from subs are consumed by the branches
        // that calc_all (defined elsewhere) is expected to emit.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        st1             {v1.4h}, [x0], #8           // halfwords 0..3
        subs            w3, w3, #1
        st1             {v1.s}[2], [x0], x8         // halfwords 4..5, then next row
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm
//
// Horizontal-then-vertical qpel filter producing 8 halfwords per row.
//   x0  output pointer (rows 128 bytes apart), x1/x2 source pointer/stride,
//   w3  number of output rows,
//   x5  selector for load_qpel_filterh (presumably "my" -- confirm),
//   x4  scratch for load_qpel_filterh.
// The horizontal helper fills a (rows + 7) * 128 byte array on the stack;
// the vertical pass walks sp through it, leaving sp restored on return.
function ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm, export=1
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2                  // x1 -= 3 * x2
        add             w10, w3, #7
        lsl             x10, x10, #7                // (rows + 7) * 128
        sub             sp, sp, x10                 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x3, x3, #7                  // helper filters rows + 7 lines
        add             x0, sp, #32                 // helper output = tmp_array
        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32          // sp -> first row of tmp_array
        mov             x7, #128                    // row stride
        load_qpel_filterh x5, x4
        // Preload seven rows, stepping sp by one 128-byte row per load.
        ldr             q16, [sp], #128
        ldr             q17, [sp], #128
        ldr             q18, [sp], #128
        ldr             q19, [sp], #128
        ldr             q20, [sp], #128
        ldr             q21, [sp], #128
        ldr             q22, [sp], #128
        // One output row: low half via calc_qpelh, high half via calc_qpelh2.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        subs            w3, w3, #1                  // flags feed the branch after calc
        st1             {v1.8h}, [x0], x7
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm
//
// Horizontal-then-vertical qpel filter producing 12 halfwords per row
// (8 halfwords, then 4 more; the output pointer advances 16 + 112 = 128).
//   x0  output pointer, x1/x2 source pointer/stride, w3 output rows,
//   x5  selector for load_qpel_filterh (presumably "my" -- confirm),
//   x4  scratch for load_qpel_filterh.
// Intermediate rows come from a (rows + 7) * 128 byte stack array that is
// consumed by post-incrementing sp.
function ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm, export=1
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2                  // x1 -= 3 * x2
        add             w10, w3, #7
        lsl             x10, x10, #7                // (rows + 7) * 128
        sub             sp, sp, x10                 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x3, x3, #7
        add             x0, sp, #32                 // helper output = tmp_array
        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32          // sp -> first row of tmp_array
        mov             x7, #128                    // tmp_array row stride
        load_qpel_filterh x5, x4
        mov             x8, #112                    // rest of an output row after 16 bytes
        // Preload seven rows, two vectors (16 halfwords) each.
        ld1             {v16.8h, v17.8h}, [sp], x7
        ld1             {v18.8h, v19.8h}, [sp], x7
        ld1             {v20.8h, v21.8h}, [sp], x7
        ld1             {v22.8h, v23.8h}, [sp], x7
        ld1             {v24.8h, v25.8h}, [sp], x7
        ld1             {v26.8h, v27.8h}, [sp], x7
        ld1             {v28.8h, v29.8h}, [sp], x7
        // src0..src7 are the first vector of each row, src8..src15 the second.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        subs            w3, w3, #1                  // flags feed the branch after calc
        st1             {v1.8h}, [x0], #16          // halfwords 0..7
        st1             {v2.4h}, [x0], x8           // halfwords 8..11, then next row
.endm
1:      calc_all2
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm
//
// Horizontal-then-vertical qpel filter producing 16 halfwords per row.
//   x0  output pointer (rows 128 bytes apart), x1/x2 source pointer/stride,
//   w3  output rows,
//   x5  selector for load_qpel_filterh (presumably "my" -- confirm),
//   x4  scratch for load_qpel_filterh.
// Intermediate rows come from a (rows + 7) * 128 byte stack array that is
// consumed by post-incrementing sp.
function ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm, export=1
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2                  // x1 -= 3 * x2
        add             w10, w3, #7
        lsl             x10, x10, #7                // (rows + 7) * 128
        sub             sp, sp, x10                 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32                 // helper output = tmp_array
        add             x3, x3, #7
        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32          // sp -> first row of tmp_array
        mov             x7, #128                    // row stride
        load_qpel_filterh x5, x4
        // Preload seven rows, two vectors (16 halfwords) each.
        ld1             {v16.8h, v17.8h}, [sp], x7
        ld1             {v18.8h, v19.8h}, [sp], x7
        ld1             {v20.8h, v21.8h}, [sp], x7
        ld1             {v22.8h, v23.8h}, [sp], x7
        ld1             {v24.8h, v25.8h}, [sp], x7
        ld1             {v26.8h, v27.8h}, [sp], x7
        ld1             {v28.8h, v29.8h}, [sp], x7
        // src0..src7 are the first vector of each row, src8..src15 the second.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
        st1             {v1.8h, v2.8h}, [x0], x7
        subs            w3, w3, #1                  // flags feed the branch after calc
.endm
1:      calc_all2
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm
//
// 24-wide variant built from two calls to the 12-wide routine.
// x0..x5 and the return address are spilled to a 64-byte frame so the
// second call can start from the original arguments, shifted by
// 12 source bytes (x1) and 24 output bytes (x0).
function ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!         // open frame
        str             x30, [sp, #48]
        stp             x0, x1, [sp, #32]
        stp             x2, x3, [sp, #16]
        bl              X(ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48           // keep only the x30 slot
        add             x0, x0, #24                 // right half of the output
        add             x1, x1, #12                 // right half of the source
        bl              X(ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm)
        ldr             x30, [sp], #16              // close frame
        ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm
//
// Horizontal-then-vertical qpel filter, processed in stripes of 16 columns.
//   x0  output pointer (rows 128 bytes apart), x1/x2 source pointer/stride,
//   w3  output rows,
//   x5  selector for load_qpel_filterh (presumably "my" -- confirm);
//       reused as the per-stripe output pointer afterwards,
//   w6  columns still to do; decremented by 16 per stripe,
//   x4  scratch for load_qpel_filterh.
// Within a stripe x8 walks the (rows + 7) * 128 byte stack array while sp
// stays put; between stripes sp and x0 both step 32 bytes to the right.
// NOTE(review): w6 is read after the bl to the h32 helper without being
// saved -- this relies on the helper leaving x6 untouched; confirm.
function ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm, export=1
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2                  // x1 -= 3 * x2
        add             w10, w3, #7
        lsl             x10, x10, #7                // (rows + 7) * 128
        sub             sp, sp, x10                 // tmp_array
        stp             x5, x30, [sp, #-32]!
        stp             x0, x3, [sp, #16]
        add             x0, sp, #32                 // helper output = tmp_array
        add             x3, x3, #7
        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32          // sp -> first row of tmp_array
        mov             x7, #128                    // row stride
        load_qpel_filterh x5, x4
0:      mov             x8, sp                      // src
        mov             w9, w3                      // height
        mov             x5, x0                      // dst
        // Preload seven rows of the current stripe.
        ld1             {v16.8h, v17.8h}, [x8], x7
        ld1             {v18.8h, v19.8h}, [x8], x7
        ld1             {v20.8h, v21.8h}, [x8], x7
        ld1             {v22.8h, v23.8h}, [x8], x7
        ld1             {v24.8h, v25.8h}, [x8], x7
        ld1             {v26.8h, v27.8h}, [x8], x7
        ld1             {v28.8h, v29.8h}, [x8], x7
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        calc_qpelh      v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
        subs            x9, x9, #1                  // flags feed the branch after calc
        st1             {v1.8h, v2.8h}, [x5], x7
.endm
1:      calc_all2
.purgem calc
2:      add             x0, x0, #32                 // next stripe of the output
        add             sp, sp, #32                 // next stripe of tmp_array
        subs            w6, w6, #16
        b.hi            0b
        // sp has advanced 64 bytes into the first row; release the rest.
        add             w10, w3, #6
        lsl             x10, x10, #7                // (rows + 6) * 128
        add             sp, sp, #64                 // discard rest of first line
        add             sp, sp, x10                 // tmp_array without first line
        ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm
//
// 48-wide variant built from two calls to the 24-wide routine.
// x0..x5 and the return address are spilled to a 64-byte frame; the second
// call uses the original arguments shifted by 24 source bytes (x1) and
// 48 output bytes (x0).
function ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!         // open frame
        str             x30, [sp, #48]
        stp             x0, x1, [sp, #32]
        stp             x2, x3, [sp, #16]
        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48           // keep only the x30 slot
        add             x0, x0, #48                 // right half of the output
        add             x1, x1, #24                 // right half of the source
        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
        ldr             x30, [sp], #16              // close frame
        ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm
//
// 64-wide variant built from two calls to the 32-wide routine, each asked
// (via x6 = 32) to cover 32 columns. x0..x5 and the return address are
// spilled to a 64-byte frame; the second call uses the original arguments
// shifted by 32 source bytes (x1) and 64 output bytes (x0).
function ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm, export=1
        stp             x4, x5, [sp, #-64]!         // open frame
        str             x30, [sp, #48]
        stp             x0, x1, [sp, #32]
        stp             x2, x3, [sp, #16]
        mov             x6, #32                     // columns for the left half
        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48           // keep only the x30 slot
        mov             x6, #32                     // columns for the right half
        add             x0, x0, #64                 // right half of the output
        add             x1, x1, #32                 // right half of the source
        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
        ldr             x30, [sp], #16              // close frame
        ret
endfunc
|
|
|
|
|
|
|
|
// QPEL_UNI_W_HV_HEADER width
//
// Common prologue of the uni_w_hv routines. It
//   - fetches mx, my and width from the caller's stack arguments,
//   - saves x19-x27 and x30 in an 80-byte frame and keeps the frame
//     address in x19 (QPEL_UNI_W_HV_END restores from it),
//   - reserves 9088 bytes (= (MAX_PB_SIZE + 7) * MAX_PB_SIZE * 2) of stack
//     as the intermediate buffer,
//   - runs the horizontal helper for \width into that buffer, and
//   - leaves the following state for the vertical pass:
//       sp       start of the intermediate buffer
//       x10      MAX_PB_SIZE * 2, the intermediate row stride in bytes
//       x20/x21  incoming x0/x1 (used as store pointer / stride later)
//       w22      height, w27 width
//       v0.8h    sign-extended 8-tap row of qpel_filters selected by my
//       v28.4s   wx, v29.4s ox, v30.4s -shift (= -(w5 + 6)), each splatted
.macro QPEL_UNI_W_HV_HEADER width
        ldp             x14, x15, [sp]              // mx, my
        ldr             w13, [sp, #16]              // width
        stp             x19, x30, [sp, #-80]!
        stp             x26, x27, [sp, #64]
        stp             x24, x25, [sp, #48]
        stp             x22, x23, [sp, #32]
        stp             x20, x21, [sp, #16]
        mov             x19, sp                     // frame address for the epilogue
        mov             x11, #9088
        sub             sp, sp, x11                 // intermediate buffer
        // Park everything needed after the call in callee-saved registers.
        mov             w27, w13                    // width
        mov             x23, x15                    // my
        add             w26, w5, #6
        mov             w25, w7                     // ox
        mov             w24, w6                     // wx
        mov             w22, w4                     // height
        mov             x21, x1
        mov             x20, x0
        neg             w26, w26                    // -shift
        // Arguments for the horizontal helper.
        mov             x0, sp                      // output = intermediate buffer
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3                  // x1 = x2 - 3 * x3
        mov             x2, x3
        add             w3, w4, #7                  // height + 7 rows
        mov             x4, x14                     // mx
        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_i8mm)
        // Vertical filter taps: 8 signed bytes widened to halfwords.
        movrel          x9, qpel_filters
        add             x9, x9, x23, lsl #3
        ld1             {v0.8b}, [x9]
        sxtl            v0.8h, v0.8b
        dup             v30.4s, w26
        dup             v29.4s, w25
        dup             v28.4s, w24
        mov             x10, #(MAX_PB_SIZE * 2)
.endm
|
|
|
|
|
|
|
|
// QPEL_UNI_W_HV_END
//
// Epilogue matching QPEL_UNI_W_HV_HEADER: rewinds sp to the frame address
// kept in x19, reloads x19-x27 and x30, and pops the 80-byte frame.
.macro QPEL_UNI_W_HV_END
        mov             sp, x19                     // drop whatever is left of the buffer
        ldp             x26, x27, [sp, #64]
        ldp             x24, x25, [sp, #48]
        ldp             x22, x23, [sp, #32]
        ldp             x20, x21, [sp, #16]
        ldp             x19, x30, [sp], #80
.endm
|
|
|
|
|
|
|
|
// QPEL_UNI_W_HV_4
//
// Weights and stores four filtered samples.
//   in : v26.4s  filter sums (left shifted right by 6 on exit)
//        v28.4s / v29.4s / v30.4s  multiplier / addend / shift amounts
//   out: 4 bytes written at [x20]; x20 advanced by x21
//   clobbers v24
.macro QPEL_UNI_W_HV_4
        sshr            v26.4s, v26.4s, #6
        mul             v24.4s, v26.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s      // saturating rounding shift by v30
        sqadd           v24.4s, v24.4s, v29.4s
        sqxtn           v24.4h, v24.4s              // 32 -> 16 bit, signed saturation
        sqxtun          v24.8b, v24.8h              // 16 -> 8 bit, unsigned saturation
        st1             {v24.s}[0], [x20], x21
.endm
|
|
|
|
|
|
|
|
// QPEL_FILTER_H dst, src0 .. src7
//
// 8-tap multiply-accumulate over the low four halfwords of each source:
//   dst.4s = sum over i of srcI.4h * v0.h[i]      (i = 0..7)
// v0.8h must hold the taps.
.macro QPEL_FILTER_H dst, src0, src1, src2, src3, src4, src5, src6, src7
        smull           \dst\().4s, \src0\().4h, v0.h[0]    // start the sum
        smlal           \dst\().4s, \src1\().4h, v0.h[1]
        smlal           \dst\().4s, \src2\().4h, v0.h[2]
        smlal           \dst\().4s, \src3\().4h, v0.h[3]
        smlal           \dst\().4s, \src4\().4h, v0.h[4]
        smlal           \dst\().4s, \src5\().4h, v0.h[5]
        smlal           \dst\().4s, \src6\().4h, v0.h[6]
        smlal           \dst\().4s, \src7\().4h, v0.h[7]
.endm
|
|
|
|
|
|
|
|
// QPEL_FILTER_H2 dst, src0 .. src7
//
// Same 8-tap multiply-accumulate as QPEL_FILTER_H, but over the high four
// halfwords of each source:
//   dst.4s = sum over i of srcI.8h[4..7] * v0.h[i]      (i = 0..7)
// v0.8h must hold the taps.
.macro QPEL_FILTER_H2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        smull2          \dst\().4s, \src0\().8h, v0.h[0]    // start the sum
        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
.endm
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_i8mm
//
// 4-wide weighted horizontal+vertical qpel filter.
// QPEL_UNI_W_HV_HEADER runs the horizontal pass into a stack buffer and
// sets up the taps (v0), weights (v28-v30), store pointer/stride (x20/x21)
// and row counter (w22). The vertical pass keeps an 8-row window in
// v16-v23, loading one new row per output row and rotating which register
// is "newest"; the loop body is unrolled eight times for that reason.
// sp is used as the read pointer and steps one intermediate row
// (MAX_PB_SIZE * 2 bytes) per load; the epilogue restores it from x19.
function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_i8mm, export=1
        QPEL_UNI_W_HV_HEADER 4
        // Prime the window with the first seven rows.
        ldr             d16, [sp], #(MAX_PB_SIZE * 2)
        ldr             d17, [sp], #(MAX_PB_SIZE * 2)
        ldr             d18, [sp], #(MAX_PB_SIZE * 2)
        ldr             d19, [sp], #(MAX_PB_SIZE * 2)
        ldr             d20, [sp], #(MAX_PB_SIZE * 2)
        ldr             d21, [sp], #(MAX_PB_SIZE * 2)
        ldr             d22, [sp], #(MAX_PB_SIZE * 2)
1:
        ldr             d23, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d16, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d17, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d18, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d19, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d20, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d21, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d22, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.hi            1b                          // window is back in its initial rotation

2:
        QPEL_UNI_W_HV_END
        ret
endfunc
|
|
|
|
|
|
|
|
// QPEL_UNI_W_HV_8
//
// Weights and stores eight filtered samples.
//   in : v26.4s, v27.4s  filter sums for samples 0..3 / 4..7
//        (both left shifted right by 6 on exit)
//        v28.4s / v29.4s / v30.4s  multiplier / addend / shift amounts
//   out: 8 bytes written at [x20]; x20 advanced by x21
//   clobbers v24, v25
// The two halves are interleaved instruction by instruction.
.macro QPEL_UNI_W_HV_8
        sshr            v26.4s, v26.4s, #6
        sshr            v27.4s, v27.4s, #6
        mul             v24.4s, v26.4s, v28.4s
        mul             v25.4s, v27.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s      // saturating rounding shift by v30
        sqrshl          v25.4s, v25.4s, v30.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqxtn           v24.4h, v24.4s              // pack both halves to 16 bit
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h              // 16 -> 8 bit, unsigned saturation
        st1             {v24.d}[0], [x20], x21
.endm
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_i8mm
//
// 8-wide weighted horizontal+vertical qpel filter.
// QPEL_UNI_W_HV_HEADER runs the horizontal pass into a stack buffer and
// sets up the taps (v0), weights (v28-v30), store pointer/stride (x20/x21)
// and row counter (w22). The vertical pass keeps an 8-row window in
// v16-v23 (8 halfwords per row) and rotates it through an eight-times
// unrolled loop; each step filters the low half into v26 and the high
// half into v27 before QPEL_UNI_W_HV_8 stores the row.
// sp is used as the read pointer and steps one intermediate row
// (MAX_PB_SIZE * 2 bytes) per load; the epilogue restores it from x19.
function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_i8mm, export=1
        QPEL_UNI_W_HV_HEADER 8
        // Prime the window with the first seven rows.
        ldr             q16, [sp], #(MAX_PB_SIZE * 2)
        ldr             q17, [sp], #(MAX_PB_SIZE * 2)
        ldr             q18, [sp], #(MAX_PB_SIZE * 2)
        ldr             q19, [sp], #(MAX_PB_SIZE * 2)
        ldr             q20, [sp], #(MAX_PB_SIZE * 2)
        ldr             q21, [sp], #(MAX_PB_SIZE * 2)
        ldr             q22, [sp], #(MAX_PB_SIZE * 2)
1:
        ldr             q23, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q16, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q17, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q18, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q19, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q20, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q21, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q22, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.hi            1b                          // window is back in its initial rotation

2:
        QPEL_UNI_W_HV_END
        ret
endfunc
|
|
|
|
|
|
|
|
// QPEL_UNI_W_HV_16
//
// Weights and stores sixteen filtered samples.
//   in : v24.4s .. v27.4s  filter sums for samples 0..3, 4..7, 8..11, 12..15
//        v28.4s / v29.4s / v30.4s  multiplier / addend / shift amounts
//   out: 16 bytes written at [x20]; x20 advanced by x21
//   v24-v27 are modified in place.
// The four quarters are interleaved instruction by instruction.
.macro QPEL_UNI_W_HV_16
        sshr            v24.4s, v24.4s, #6
        sshr            v25.4s, v25.4s, #6
        sshr            v26.4s, v26.4s, #6
        sshr            v27.4s, v27.4s, #6
        mul             v24.4s, v24.4s, v28.4s
        mul             v25.4s, v25.4s, v28.4s
        mul             v26.4s, v26.4s, v28.4s
        mul             v27.4s, v27.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s      // saturating rounding shift by v30
        sqrshl          v25.4s, v25.4s, v30.4s
        sqrshl          v26.4s, v26.4s, v30.4s
        sqrshl          v27.4s, v27.4s, v30.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqadd           v26.4s, v26.4s, v29.4s
        sqadd           v27.4s, v27.4s, v29.4s
        sqxtn           v24.4h, v24.4s              // samples 0..7 to 16 bit in v24
        sqxtn2          v24.8h, v25.4s
        sqxtn           v26.4h, v26.4s              // samples 8..15 to 16 bit in v26
        sqxtn2          v26.8h, v27.4s
        sqxtun          v24.8b, v24.8h              // all 16 samples to 8 bit in v24
        sqxtun2         v24.16b, v26.8h

        st1             {v24.16b}, [x20], x21
.endm
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm
//
// 16-wide weighted horizontal+vertical qpel filter.
// QPEL_UNI_W_HV_HEADER runs the horizontal pass into a stack buffer and
// sets up the taps (v0), weights (v28-v30), store pointer/stride (x20/x21)
// and row counter (w22). Each intermediate row is 16 halfwords; its first
// half lives in the v16-v23 window and its second half in the
// v1-v7 + v31 window. Both windows rotate in lock-step through an
// eight-times unrolled loop, and every step produces v24..v27 for
// QPEL_UNI_W_HV_16.
// sp is used as the read pointer and steps one intermediate row
// (MAX_PB_SIZE * 2 bytes) per load; the epilogue restores it from x19.
function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm, export=1
        QPEL_UNI_W_HV_HEADER 16
        // Prime both windows with the first seven rows.
        ldp             q16, q1, [sp], #(MAX_PB_SIZE * 2)
        ldp             q17, q2, [sp], #(MAX_PB_SIZE * 2)
        ldp             q18, q3, [sp], #(MAX_PB_SIZE * 2)
        ldp             q19, q4, [sp], #(MAX_PB_SIZE * 2)
        ldp             q20, q5, [sp], #(MAX_PB_SIZE * 2)
        ldp             q21, q6, [sp], #(MAX_PB_SIZE * 2)
        ldp             q22, q7, [sp], #(MAX_PB_SIZE * 2)
1:
        ldp             q23, q31, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H   v26, v1, v2, v3, v4, v5, v6, v7, v31
        QPEL_FILTER_H2  v27, v1, v2, v3, v4, v5, v6, v7, v31
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q16, q1, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H   v26, v2, v3, v4, v5, v6, v7, v31, v1
        QPEL_FILTER_H2  v27, v2, v3, v4, v5, v6, v7, v31, v1
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q17, q2, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H   v26, v3, v4, v5, v6, v7, v31, v1, v2
        QPEL_FILTER_H2  v27, v3, v4, v5, v6, v7, v31, v1, v2
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q18, q3, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H   v26, v4, v5, v6, v7, v31, v1, v2, v3
        QPEL_FILTER_H2  v27, v4, v5, v6, v7, v31, v1, v2, v3
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q19, q4, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H   v26, v5, v6, v7, v31, v1, v2, v3, v4
        QPEL_FILTER_H2  v27, v5, v6, v7, v31, v1, v2, v3, v4
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q20, q5, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H   v26, v6, v7, v31, v1, v2, v3, v4, v5
        QPEL_FILTER_H2  v27, v6, v7, v31, v1, v2, v3, v4, v5
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q21, q6, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H   v26, v7, v31, v1, v2, v3, v4, v5, v6
        QPEL_FILTER_H2  v27, v7, v31, v1, v2, v3, v4, v5, v6
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q22, q7, [sp], #(MAX_PB_SIZE * 2)
        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H   v26, v31, v1, v2, v3, v4, v5, v6, v7
        QPEL_FILTER_H2  v27, v31, v1, v2, v3, v4, v5, v6, v7
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.hi            1b                          // windows are back in their initial rotation

2:
        QPEL_UNI_W_HV_END
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
|
|
|
|
QPEL_UNI_W_HV_HEADER 32
|
|
|
|
mov x11, sp
|
|
|
|
mov w12, w22
|
|
|
|
mov x13, x20
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
mov x14, sp
|
|
|
|
3:
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q16, q1, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
ldp q17, q2, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
ldp q18, q3, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
ldp q19, q4, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
ldp q20, q5, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
ldp q21, q6, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
ldp q22, q7, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
1:
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q23, q31, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
|
|
|
|
QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
|
|
|
|
QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
|
|
|
|
QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.eq 2f
|
|
|
|
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q16, q1, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
|
|
|
|
QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
|
|
|
|
QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
|
|
|
|
QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.eq 2f
|
|
|
|
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q17, q2, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
|
|
|
|
QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
|
|
|
|
QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
|
|
|
|
QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.eq 2f
|
|
|
|
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q18, q3, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
|
|
|
|
QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
|
|
|
|
QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
|
|
|
|
QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.eq 2f
|
|
|
|
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q19, q4, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
|
|
|
|
QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
|
|
|
|
QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
|
|
|
|
QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.eq 2f
|
|
|
|
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q20, q5, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
|
|
|
|
QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
|
|
|
|
QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
|
|
|
|
QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.eq 2f
|
|
|
|
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q21, q6, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
|
|
|
|
QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
|
|
|
|
QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
|
|
|
|
QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.eq 2f
|
|
|
|
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
ldp q22, q7, [x11]
|
|
|
|
add x11, x11, x10
|
|
|
|
QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
|
|
|
|
QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
|
|
|
|
QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
|
|
|
|
QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
|
|
|
|
QPEL_UNI_W_HV_16
|
|
|
|
subs w22, w22, #1
|
|
|
|
b.hi 1b
|
|
|
|
2:
|
|
|
|
subs w27, w27, #16
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
add x11, x14, #32
|
|
|
|
add x20, x13, #16
|
|
|
|
mov w22, w12
|
aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm
Many of the routines within hevcdsp_epel_neon and hevcdsp_qpel_neon
store temporary buffers on the stack. When consuming it,
many of these functions use the stack pointer as incremental pointer
for reading the data (instead of storing it in another register),
which is rather unusual.
Technically, this is fine as long as the pointer remains properly
aligned.
However in the case of ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm,
after incrementing sp when reading data (within each 16 pixel
wide stripe) it would then reset the stack pointer back to a lower
value, for reading the next 16 pixel wide stripe, expecting the
data to remain untouched.
This can't be assumed; data on the stack below the stack pointer
can be clobbered (e.g. by a signal handler). Some OS ABIs
allow for a little margin that won't be touched, aka a red zone,
but not all do. The ones that do, guarantee 16 or 128 bytes, not
9 KB.
Convert this function to use a separate pointer register to
iterate through the data, retaining the stack pointer to point
at the bottom of the data we require to remain untouched.
Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago
|
|
|
mov x14, x11
|
|
|
|
mov x13, x20
|
|
|
|
b.hi 3b
|
|
|
|
QPEL_UNI_W_HV_END
|
|
|
|
ret
|
|
|
|
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm
//
// Vertical pass of the 64 pixel wide uni-weighted HV qpel filter. The
// intermediate rows are consumed in stripes of 16 pixels (32 bytes of
// intermediate data per row); each stripe is walked top to bottom with an
// 8 row sliding window that is rotated through v16-v23 (low half of the
// stripe) and v1-v7/v31 (high half of the stripe).
//
// The temporary buffer lives on the stack. It is iterated through x11
// instead of through sp: every stripe restarts from the top of the buffer,
// and data below the stack pointer may be clobbered at any time (e.g. by a
// signal handler), so sp has to keep pointing at the bottom of the data
// that must remain untouched.
//
// Register roles inside this function:
//   x11  read pointer into the temporary buffer
//   x14  start of the current stripe in the temporary buffer
//   x13  x20 value at the start of the current stripe
//   w12  w22 value at function entry (rows per stripe)
//   x10  byte distance between two rows of the temporary buffer
//   w22  rows left in the current stripe
//   w27  pixels left in the row, reduced by 16 per stripe
// NOTE(review): x10, x20, w22 and w27 are not initialised here; they are
// presumably set up by QPEL_UNI_W_HV_HEADER (x20 presumably being the
// destination pointer used by QPEL_UNI_W_HV_16) - confirm against the macro.
function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
        QPEL_UNI_W_HV_HEADER 64
        mov             x14, sp                 // first stripe starts at the bottom of the buffer
        mov             x11, sp
        mov             x13, x20
        mov             w12, w22
3:
        // Prime the sliding window with the first 7 rows of this stripe.
        ldp             q16, q1, [x11]
        add             x11, x11, x10
        ldp             q17, q2, [x11]
        add             x11, x11, x10
        ldp             q18, q3, [x11]
        add             x11, x11, x10
        ldp             q19, q4, [x11]
        add             x11, x11, x10
        ldp             q20, q5, [x11]
        add             x11, x11, x10
        ldp             q21, q6, [x11]
        add             x11, x11, x10
        ldp             q22, q7, [x11]
        add             x11, x11, x10
1:
        // Each step loads one new row into the oldest window slot, filters
        // the 8 row window into v24-v27 and hands the result to
        // QPEL_UNI_W_HV_16. The loop is unrolled 8 times so that the window
        // rotates by register renaming instead of by moving data.
        ldp             q23, q31, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H   v26, v1, v2, v3, v4, v5, v6, v7, v31
        QPEL_FILTER_H2  v27, v1, v2, v3, v4, v5, v6, v7, v31
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q16, q1, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H   v26, v2, v3, v4, v5, v6, v7, v31, v1
        QPEL_FILTER_H2  v27, v2, v3, v4, v5, v6, v7, v31, v1
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q17, q2, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H   v26, v3, v4, v5, v6, v7, v31, v1, v2
        QPEL_FILTER_H2  v27, v3, v4, v5, v6, v7, v31, v1, v2
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q18, q3, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H   v26, v4, v5, v6, v7, v31, v1, v2, v3
        QPEL_FILTER_H2  v27, v4, v5, v6, v7, v31, v1, v2, v3
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q19, q4, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H   v26, v5, v6, v7, v31, v1, v2, v3, v4
        QPEL_FILTER_H2  v27, v5, v6, v7, v31, v1, v2, v3, v4
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q20, q5, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H   v26, v6, v7, v31, v1, v2, v3, v4, v5
        QPEL_FILTER_H2  v27, v6, v7, v31, v1, v2, v3, v4, v5
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q21, q6, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H   v26, v7, v31, v1, v2, v3, v4, v5, v6
        QPEL_FILTER_H2  v27, v7, v31, v1, v2, v3, v4, v5, v6
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q22, q7, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H   v26, v31, v1, v2, v3, v4, v5, v6, v7
        QPEL_FILTER_H2  v27, v31, v1, v2, v3, v4, v5, v6, v7
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.hi            1b
2:
        // Stripe finished: step 32 bytes to the right in the temporary
        // buffer and 16 bytes to the right in x13/x20, reload the row
        // counter and continue while there are pixels left in the row.
        add             x14, x14, #32
        add             x13, x13, #16
        mov             w22, w12
        mov             x11, x14
        mov             x20, x13
        subs            w27, w27, #16           // flags are consumed by b.hi below
        b.hi            3b
        QPEL_UNI_W_HV_END
        ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_bi_hv4_8_neon_i8mm
//
// 4 pixel wide bi-predicted HV qpel filter. The horizontal pass is done by
// ff_hevc_put_hevc_qpel_h4_8_neon_i8mm into a temporary array on the stack
// holding (height + 7) rows of MAX_PB_SIZE * 2 bytes each; the vertical
// pass then filters that array, adds src2, rounds by 7 bits, saturates to
// 8 bit and stores 4 pixels per row.
//
// In:  x0 = dst, x1 = byte step between dst rows, x4 = src2, w5 = height
//      x2, x3, x6, x7: presumably src, srcstride, mx and my - only their
//      use is visible here (x2 - 3 * x3 and x3 go to the horizontal pass
//      as its source arguments, x6 is forwarded in x4, x7 selects the
//      vertical filter) - TODO confirm against the C prototype.
//
// The vertical pass reads the temporary array through sp itself with a
// post-increment of one row; after all (height + 7) rows have been read,
// sp is back at its value on entry.
function ff_hevc_put_hevc_qpel_bi_hv4_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (height + 7) rows of 128 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x0, x1, [sp, #32]
        stp             x4, x5, [sp, #16]
        // Arguments for the horizontal pass.
        add             x0, sp, #48             // start of tmp_array
        mov             x4, x6
        sub             x1, x2, x3
        sub             x1, x1, x3, lsl #1      // x1 = x2 - 3 * x3
        mov             x2, x3
        add             w3, w5, #7              // rows to produce
        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp, #16]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x9, #(MAX_PB_SIZE * 2)  // row size of tmp_array and step of src2
        load_qpel_filterh x7, x6
        // Prime the vertical window with the first 7 rows.
        ld1             {v16.4h}, [sp], x9
        ld1             {v17.4h}, [sp], x9
        ld1             {v18.4h}, [sp], x9
        ld1             {v19.4h}, [sp], x9
        ld1             {v20.4h}, [sp], x9
        ld1             {v21.4h}, [sp], x9
        ld1             {v22.4h}, [sp], x9
// One output row: load the next intermediate row into \tmp, filter the
// 8 row window, add src2, round/narrow and store 4 bytes. The flags set by
// subs are left for the code that expands this macro.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.4h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        rshrn           v1.4h, v1.4s, #7
        sqxtun          v1.8b, v1.8h
        subs            w5, w5, #1
        st1             {v1.s}[0], [x0], x1
.endm
        // calc_all is defined elsewhere in this file; it presumably expands
        // calc once per window rotation and leaves through label 2.
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_bi_hv6_8_neon_i8mm
//
// 6 pixel wide bi-predicted HV qpel filter. The horizontal pass is done by
// ff_hevc_put_hevc_qpel_h6_8_neon_i8mm into a temporary array on the stack
// holding (height + 7) rows of MAX_PB_SIZE * 2 bytes each; the vertical
// pass then filters that array, adds src2, rounds by 7 bits, saturates to
// 8 bit and stores 6 pixels per row (one 4 byte and one 2 byte store).
//
// In:  x0 = dst, x1 = byte step between dst rows, x4 = src2, w5 = height
//      x2, x3, x6, x7: presumably src, srcstride, mx and my - only their
//      use is visible here (x2 - 3 * x3 and x3 go to the horizontal pass
//      as its source arguments, x6 is forwarded in x4, x7 selects the
//      vertical filter) - TODO confirm against the C prototype.
//
// The vertical pass reads the temporary array through sp itself with a
// post-increment of one row; after all (height + 7) rows have been read,
// sp is back at its value on entry.
function ff_hevc_put_hevc_qpel_bi_hv6_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (height + 7) rows of 128 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        // Arguments for the horizontal pass.
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #48             // start of tmp_array
        mov             x2, x3
        // height is a 32 bit value: use the w view, as the allocation above
        // and the other hv variants do, so that the upper half of x5 can
        // never leak into the row count handed to the horizontal pass.
        add             w3, w5, #7              // rows to produce
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x9, #(MAX_PB_SIZE * 2)  // row size of tmp_array and step of src2
        load_qpel_filterh x7, x6
        sub             x1, x1, #4              // the first store of each row already advances dst by 4
        // Prime the vertical window with the first 7 rows.
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
// One output row: load the next intermediate row into \tmp, filter the
// 8 row window, add src2, round/narrow and store 4 + 2 bytes. The flags
// set by subs are left for the code that expands this macro.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.8h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        sqxtun          v1.8b, v1.8h
        st1             {v1.s}[0], [x0], #4
        subs            w5, w5, #1
        st1             {v1.h}[2], [x0], x1
.endm
        // calc_all is defined elsewhere in this file; it presumably expands
        // calc once per window rotation and leaves through label 2.
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm
//
// 8 pixel wide bi-predicted HV qpel filter. The horizontal pass is done by
// ff_hevc_put_hevc_qpel_h8_8_neon_i8mm into a temporary array on the stack
// holding (height + 7) rows of MAX_PB_SIZE * 2 bytes each; the vertical
// pass then filters that array, adds src2, rounds by 7 bits, saturates to
// 8 bit and stores 8 pixels per row.
//
// In:  x0 = dst, x1 = byte step between dst rows, x4 = src2, w5 = height
//      x2, x3, x6, x7: presumably src, srcstride, mx and my - only their
//      use is visible here (x2 - 3 * x3 and x3 go to the horizontal pass
//      as its source arguments, x6 is forwarded in x4, x7 selects the
//      vertical filter) - TODO confirm against the C prototype.
//
// The vertical pass reads the temporary array through sp itself with a
// post-increment of one row; after all (height + 7) rows have been read,
// sp is back at its value on entry.
function ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm, export=1
        add             w10, w5, #7
        lsl             x10, x10, #7            // (height + 7) rows of 128 bytes
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-48]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        // Arguments for the horizontal pass.
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #48             // start of tmp_array
        mov             x2, x3
        // height is a 32 bit value: use the w view, as the allocation above
        // and the other hv variants do, so that the upper half of x5 can
        // never leak into the row count handed to the horizontal pass.
        add             w3, w5, #7              // rows to produce
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #48      // sp now points at tmp_array
        mov             x9, #(MAX_PB_SIZE * 2)  // row size of tmp_array and step of src2
        load_qpel_filterh x7, x6
        // Prime the vertical window with the first 7 rows.
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
// One output row: load the next intermediate row into \tmp, filter the
// 8 row window, add src2, round/narrow and store 8 bytes. The flags set by
// subs are left for the code that expands this macro.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.8h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        sqxtun          v1.8b, v1.8h
        subs            w5, w5, #1
        st1             {v1.8b}, [x0], x1
.endm
        // calc_all is defined elsewhere in this file; it presumably expands
        // calc once per window rotation and leaves through label 2.
1:      calc_all
.purgem calc
2:      ret
endfunc
|
|
|
|
|
|
|
|
// ff_hevc_put_hevc_qpel_bi_hv12_8_neon_i8mm
//
// 12 pixel wide bi-predicted HV qpel filter, built from the 8 wide variant
// for the left columns followed by the 4 wide variant for the remaining
// columns. x0-x7 are spilled so that the second call sees the same
// arguments as the first one, shifted 8 pixels to the right.
//
// Stack frame (80 bytes): [sp, #0] x6/x7, [sp, #16] x4/x5, [sp, #32] x2/x3,
// [sp, #48] x0/x1, [sp, #64] x30. The argument part is dropped before the
// second call; only the 16 byte slot holding x30 stays live across it.
function ff_hevc_put_hevc_qpel_bi_hv12_8_neon_i8mm, export=1
        stp             x6, x7, [sp, #-80]!
        str             x30, [sp, #64]
        stp             x0, x1, [sp, #48]
        stp             x2, x3, [sp, #32]
        stp             x4, x5, [sp, #16]
        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm)
        ldp             x0, x1, [sp, #48]
        ldp             x2, x3, [sp, #32]
        ldp             x4, x5, [sp, #16]
        ldp             x6, x7, [sp], #64       // sp now points at the saved x30
        add             x0, x0, #8              // advance x0 (dst in the sibling variants) by 8 bytes
        add             x2, x2, #8              // advance x2 by 8 bytes (presumably src)
        add             x4, x4, #16             // advance x4 (src2 in the sibling variants) by 16 bytes
        bl              X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon_i8mm)
        ldr             x30, [sp], #16
        ret
endfunc
|
|
|
|
|
|
|
|
// Bi-predictive qpel interpolation, horizontal pass followed by vertical pass,
// for a 16-pixel-wide block of 8-bit samples.
//
// Register use on entry, as read from the code below:
//   x0 = dst, x1 = dst stride
//   x2 = src, x3 = src stride
//   x4 = src2, 16-bit samples, row pitch MAX_PB_SIZE * 2 bytes
//   w5 = height
//   x6 = forwarded as the fifth argument of the horizontal pass (presumably mx)
//   x7 = index handed to load_qpel_filterh (presumably my)
//
// The code from .Lqpel_bi_hv16_loop onwards is shared: the 32, 48 and 64 wide
// variants in this file branch to that label with their own width in x6.
function ff_hevc_put_hevc_qpel_bi_hv16_8_neon_i8mm, export=1
|
|
|
|
add w10, w5, #7 // intermediate rows: height + 7
|
|
|
|
lsl x10, x10, #7 // 128 bytes per intermediate row
|
|
|
|
sub sp, sp, x10 // tmp_array
|
|
|
|
stp x7, x30, [sp, #-48]! // keep vertical filter index and lr
|
|
|
|
stp x4, x5, [sp, #16] // keep src2 and height
|
|
|
|
stp x0, x1, [sp, #32] // keep dst and dst stride
|
|
|
|
add x0, sp, #48 // tmp_array sits above the 48-byte save area
|
|
|
|
sub x1, x2, x3, lsl #1
|
|
|
|
sub x1, x1, x3 // src - 3 * src stride
|
|
|
|
mov x2, x3 // src stride
|
|
|
|
add w3, w5, #7 // rows for the horizontal pass
|
|
|
|
mov x4, x6
|
|
|
|
bl X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
|
|
|
|
ldp x4, x5, [sp, #16]
|
|
|
|
ldp x0, x1, [sp, #32]
|
|
|
|
ldp x7, x30, [sp], #48 // sp now points at row 0 of tmp_array
|
|
|
|
mov x6, #16 // width
|
|
|
|
// Shared vertical pass. Expects x0/x1 = dst/stride, x4 = src2, w5 = height,
// x6 = width, x7 = vertical filter index, sp = start of tmp_array.
.Lqpel_bi_hv16_loop:
|
|
|
|
load_qpel_filterh x7, x8
|
|
|
|
mov x9, #(MAX_PB_SIZE * 2) // row pitch of tmp_array and src2
|
|
|
|
mov x10, x6 // columns still to process
|
|
|
|
// Start of one 16-column strip; per-strip pointers are reset from x0, x4, sp.
0: mov x8, sp // src: current strip of tmp_array
|
|
|
|
ld1 {v16.8h, v17.8h}, [x8], x9
|
|
|
|
mov w11, w5 // height
|
|
|
|
ld1 {v18.8h, v19.8h}, [x8], x9
|
|
|
|
mov x12, x4 // src2
|
|
|
|
ld1 {v20.8h, v21.8h}, [x8], x9
|
|
|
|
mov x7, x0 // dst (x7 is free, the filter is already loaded)
|
|
|
|
ld1 {v22.8h, v23.8h}, [x8], x9
|
|
|
|
ld1 {v24.8h, v25.8h}, [x8], x9
|
|
|
|
ld1 {v26.8h, v27.8h}, [x8], x9
|
|
|
|
ld1 {v28.8h, v29.8h}, [x8], x9
|
|
|
|
// One output row of 16 pixels: load the eighth window row, filter both
// 8-column halves vertically, add src2, round-shift by 7, saturate to
// unsigned 8 bit and store.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
|
|
|
|
ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x9
|
|
|
|
calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
|
|
|
|
calc_qpelh2 v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
|
|
|
|
calc_qpelh v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
|
|
|
|
calc_qpelh2 v4, v4, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
|
|
|
|
ld1 {v5.8h, v6.8h}, [x12], x9 // src2
|
|
|
|
saddw v1.4s, v1.4s, v5.4h // columns 0-3
|
|
|
|
saddw2 v2.4s, v2.4s, v5.8h // columns 4-7
|
|
|
|
saddw v3.4s, v3.4s, v6.4h // columns 8-11
|
|
|
|
saddw2 v4.4s, v4.4s, v6.8h // columns 12-15
|
|
|
|
rshrn v1.4h, v1.4s, #7
|
|
|
|
rshrn2 v1.8h, v2.4s, #7
|
|
|
|
rshrn v2.4h, v3.4s, #7
|
|
|
|
rshrn2 v2.8h, v4.4s, #7
|
|
|
|
sqxtun v1.8b, v1.8h
|
|
|
|
sqxtun2 v1.16b, v2.8h
|
|
|
|
subs x11, x11, #1 // flags are consumed after the store
|
|
|
|
st1 {v1.16b}, [x7], x1
|
|
|
|
.endm
|
|
|
|
// calc_all2 is defined elsewhere in this file; it presumably expands calc
// with a rotating register window, loops back to 1 and leaves through 2
// once x11 reaches zero -- TODO confirm.
1: calc_all2
|
|
|
|
.purgem calc
|
|
|
|
// Advance to the next 16-column strip.
2: add x0, x0, #16 // dst: 16 bytes
|
|
|
|
add sp, sp, #32 // tmp_array: 16 halfwords
|
|
|
|
subs x10, x10, #16
|
|
|
|
add x4, x4, #32 // src2: 16 halfwords
|
|
|
|
b.ne 0b
|
|
|
|
// Release the rest of tmp_array: sp has already moved by 2 * width bytes.
add w10, w5, #7
|
|
|
|
lsl x10, x10, #7
|
|
|
|
sub x10, x10, x6, lsl #1 // part of first line
|
|
|
|
add sp, sp, x10 // tmp_array without first line
|
|
|
|
ret
|
|
|
|
endfunc
|
|
|
|
|
|
|
|
// 24-wide bi-predictive qpel hv: the 16-wide kernel handles the left part and
// the 8-wide kernel handles the part starting 16 pixels to the right.
// x0-x7 are parked on the stack across the first call so the second call
// sees the same arguments, with only the three data pointers advanced.
function ff_hevc_put_hevc_qpel_bi_hv24_8_neon_i8mm, export=1
        sub             sp, sp, #80                     // four register pairs plus a 16-byte lr slot
        str             x30, [sp, #64]
        stp             x0, x1, [sp, #48]
        stp             x2, x3, [sp, #32]
        stp             x4, x5, [sp, #16]
        stp             x6, x7, [sp]
        bl              X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon_i8mm)
        ldp             x6, x7, [sp]
        ldp             x0, x1, [sp, #48]
        ldp             x2, x3, [sp, #32]
        ldp             x4, x5, [sp, #16]
        add             sp, sp, #64                     // only the lr slot stays on the stack
        add             x0, x0, #16                     // x0 (dst)  += 16 bytes
        add             x2, x2, #16                     // x2 (src)  += 16 bytes
        add             x4, x4, #32                     // x4 (src2) += 16 halfwords
        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon_i8mm)
        ldr             x30, [sp], #16                  // restore lr and drop its slot
        ret
endfunc
|
|
|
|
|
|
|
|
// 32-wide bi-predictive qpel hv. Runs the 32-wide horizontal pass into a
// stack buffer of height + 7 rows (128 bytes each), then joins the shared
// vertical loop of the 16-wide variant with x6 = 32.
function ff_hevc_put_hevc_qpel_bi_hv32_8_neon_i8mm, export=1
        add             w10, w5, #7                     // intermediate rows: height + 7
        lsl             x10, x10, #7                    // 128 bytes per row
        sub             sp, sp, x10                     // tmp_array
        stp             x7, x30, [sp, #-48]!            // saved across the call, with lr
        stp             x0, x1, [sp, #32]
        stp             x4, x5, [sp, #16]
        mov             x4, x6                          // fifth argument of the horizontal pass
        add             x0, sp, #48                     // tmp_array sits above the save area
        add             x1, x3, x3, lsl #1              // 3 * src stride
        sub             x1, x2, x1                      // src - 3 * src stride
        mov             x2, x3                          // src stride
        add             w3, w5, #7                      // rows for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp, #16]
        ldp             x7, x30, [sp], #48              // sp now points at row 0 of tmp_array
        mov             x6, #32                         // width
        b               .Lqpel_bi_hv16_loop
endfunc
|
|
|
|
|
|
|
|
// 48-wide bi-predictive qpel hv. Runs the 48-wide horizontal pass into a
// stack buffer of height + 7 rows (128 bytes each), then joins the shared
// vertical loop of the 16-wide variant with x6 = 48.
function ff_hevc_put_hevc_qpel_bi_hv48_8_neon_i8mm, export=1
        add             w10, w5, #7                     // intermediate rows: height + 7
        lsl             x10, x10, #7                    // 128 bytes per row
        sub             sp, sp, x10                     // tmp_array
        stp             x7, x30, [sp, #-48]!            // saved across the call, with lr
        stp             x0, x1, [sp, #32]
        stp             x4, x5, [sp, #16]
        mov             x4, x6                          // fifth argument of the horizontal pass
        add             x0, sp, #48                     // tmp_array sits above the save area
        add             x1, x3, x3, lsl #1              // 3 * src stride
        sub             x1, x2, x1                      // src - 3 * src stride
        mov             x2, x3                          // src stride
        add             w3, w5, #7                      // rows for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp, #16]
        ldp             x7, x30, [sp], #48              // sp now points at row 0 of tmp_array
        mov             x6, #48                         // width
        b               .Lqpel_bi_hv16_loop
endfunc
|
|
|
|
|
|
|
|
// 64-wide bi-predictive qpel hv. Runs the 64-wide horizontal pass into a
// stack buffer of height + 7 rows (128 bytes each), then joins the shared
// vertical loop of the 16-wide variant with x6 = 64.
function ff_hevc_put_hevc_qpel_bi_hv64_8_neon_i8mm, export=1
        add             w10, w5, #7                     // intermediate rows: height + 7
        lsl             x10, x10, #7                    // 128 bytes per row
        sub             sp, sp, x10                     // tmp_array
        stp             x7, x30, [sp, #-48]!            // saved across the call, with lr
        stp             x0, x1, [sp, #32]
        stp             x4, x5, [sp, #16]
        mov             x4, x6                          // fifth argument of the horizontal pass
        add             x0, sp, #48                     // tmp_array sits above the save area
        add             x1, x3, x3, lsl #1              // 3 * src stride
        sub             x1, x2, x1                      // src - 3 * src stride
        mov             x2, x3                          // src stride
        add             w3, w5, #7                      // rows for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon_i8mm)
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp, #16]
        ldp             x7, x30, [sp], #48              // sp now points at row 0 of tmp_array
        mov             x6, #64                         // width
        b               .Lqpel_bi_hv16_loop
endfunc
|
|
|
|
|
|
|
|
DISABLE_I8MM
|
|
|
|
#endif // HAVE_I8MM
|