/* Source: mirror of https://github.com/FFmpeg/FFmpeg.git (484 lines, 16 KiB) */
/* -*-arm64-*-
 * vim: syntax=arm64asm
 *
 * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#define MAX_PB_SIZE 64

// 8-tap filter coefficients, one 8-byte row per filter index.
// load_filter addresses a row as qpel_filters + index * 8 and sign-extends
// the bytes to 16 bits. Row 0 is all zero.
const qpel_filters, align=4
        .byte            0,  0,  0,  0,  0,  0,  0,  0
        .byte           -1,  4,-10, 58, 17, -5,  1,  0
        .byte           -1,  4,-11, 40, 40,-11,  4, -1
        .byte            0,  1, -5, 17, 58,-10,  4, -1
endconst

// load_filter m
//   Fetch the 8 taps of row m of qpel_filters into v0.8h.
//   In:    m     = X register holding the row index
//   Out:   v0.8h = taps, sign-extended from int8
//   Clobb: x15 (left pointing at the selected row)
.macro load_filter m
        movrel          x15, qpel_filters
        add             x15, x15, \m, lsl #3    // rows are 8 bytes apart
        ld1             {v0.8b}, [x15]
        sxtl            v0.8h, v0.8b            // 16-bit lanes so taps can be used as v0.h[i]
.endm

// put_hevc type
//   Emits the horizontal 8-bit filter entry points
//   ff_hevc_put_hevc_<type>_h{4,6,8,12,16}_8_neon for type = qpel, qpel_uni
//   or qpel_bi. The register aliases below map the C arguments of each
//   variant onto common names used by the function bodies; they are released
//   again with .unreq at the end of the macro.
.macro put_hevc type
.ifc \type, qpel
        // void put_hevc_qpel_h(int16_t *dst,
        //                      uint8_t *_src, ptrdiff_t _srcstride,
        //                      int height, intptr_t mx, intptr_t my, int width)
        dst             .req x0
        src             .req x1
        srcstride       .req x2
        height          .req x3
        heightw         .req w3
        mx              .req x4
        width           .req w6
        dststride       .req x7         // not an argument; the bodies load it with MAX_PB_SIZE << 1
.endif
.ifc \type, qpel_uni
        // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                          uint8_t *_src, ptrdiff_t _srcstride,
        //                          int height, intptr_t mx, intptr_t my, int width)
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x4
        heightw         .req w4
        mx              .req x5
        width           .req w7
.endif
.ifc \type, qpel_bi
        // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                         uint8_t *_src, ptrdiff_t _srcstride,
        //                         int16_t *src2, int height, intptr_t mx,
        //                         intptr_t my, int width)
        // src2 stays in x4 and is addressed by register number in the bodies.
        dst             .req x0
        dststride       .req x1
        src             .req x2
        srcstride       .req x3
        height          .req x5
        heightw         .req w5
        mx              .req x6
        width           .req w8         // stack argument; loaded from [sp] by the h12/h16 bodies
.endif

.ifc \type, qpel
// Internal 8-tap kernel: two rows, 4 outputs each. Emitted once (by the qpel
// instantiation) and called with bl from the h4 entry point of every variant.
//   In:    v0.8h          = filter taps
//          v16.8b, v17.8b = 16 source bytes of the first row
//          v18.8b, v19.8b = 16 source bytes of the second row
//   Out:   v23.4h, v24.4h = 16-bit filter sums of the first / second row
//   Clobb: v16-v21; NZCV and general registers are left untouched
function ff_hevc_put_hevc_h4_8_neon, export=0
        // first row: widen to 16 bit and start the sum with tap 0
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        mul             v23.4h, v16.4h, v0.h[0]
        // second row
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        mul             v24.4h, v18.4h, v0.h[0]

        // taps 1..7: slide the window one pixel (2 bytes) per tap
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        mla             v23.4h, v20.4h, v0.h[\i]
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v24.4h, v21.4h, v0.h[\i]
.endr
        ret
endfunc
.endif

// Horizontal 8-tap filter, 4 pixels wide, two rows per loop iteration.
// The width argument is not read. The return address is parked in the mx
// register because bl overwrites x30.
// Clobbers x10, x12-x15 (and x16 for qpel_bi), v0, v16-v21, v23-v26.
function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
        load_filter     mx
        mov             mx, x30                 // mx is dead after load_filter; reuse it for the return address
        sub             src, src, #3            // window starts 3 pixels left of the output position
        lsl             x13, srcstride, #1      // srcstridel
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        mov             x14, #(MAX_PB_SIZE << 2) // dststridel
.else
        lsl             x14, dststride, #1      // dststridel
.endif
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        add             x12, src, srcstride     // srcb
        add             x10, dst, dststride     // dstb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h4_8_neon
        subs            heightw, heightw, #2    // flags are consumed by b.gt below; no NZCV writer in between

.ifc \type, qpel
        // 16-bit intermediate output, 4 values per row
        st1             {v23.4h}, [dst], x14
        st1             {v24.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add the second prediction, then round, shift and narrow to u8
        sqadd           v23.4h, v23.4h, v25.4h
        sqrshrun        v23.8b, v23.8h, #7
        sqadd           v24.4h, v24.4h, v26.4h
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift and narrow to u8
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.s}[0], [dst], x14
        st1             {v24.s}[0], [x10], x14
.endif
        b.gt            0b                      // next pair of rows
        ret             mx
endfunc

.ifc \type, qpel
// Internal 8-tap kernel: two rows, 8 outputs each. Emitted once (by the qpel
// instantiation) and called with bl from the h6 and h8 entry points.
//   In:    v0.8h          = filter taps
//          v16.8b, v17.8b = 16 source bytes of the first row
//          v18.8b, v19.8b = 16 source bytes of the second row
//   Out:   v23.8h, v24.8h = 16-bit filter sums of the first / second row
//   Clobb: v16-v21; NZCV and general registers are left untouched
function ff_hevc_put_hevc_h8_8_neon, export=0
        // first row: widen to 16 bit and start the sum with tap 0
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        mul             v23.8h, v16.8h, v0.h[0]
        // second row
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        mul             v24.8h, v18.8h, v0.h[0]

        // taps 1..7: slide the window one pixel (2 bytes) per tap
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        mla             v23.8h, v20.8h, v0.h[\i]
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v24.8h, v21.8h, v0.h[\i]
.endr
        ret
endfunc
.endif

// Horizontal 8-tap filter, 6 pixels wide, two rows per loop iteration.
// Eight outputs per row are computed with the h8 kernel and only the first
// six are stored, as a 4 + 2 element split. The width argument is not read.
// Clobbers x10, x12-x15 (and x16 for qpel_bi), v0, v16-v21, v23-v26.
function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
        load_filter     mx
        mov             mx, x30                 // mx is dead after load_filter; reuse it for the return address
        sub             src, src, #3            // window starts 3 pixels left of the output position
        lsl             x13, srcstride, #1      // srcstridel
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        mov             x14, #((MAX_PB_SIZE << 2) - 8) // dststridel less the 8 bytes of the first store
.else
        lsl             x14, dststride, #1      // dststridel
        sub             x14, x14, #4            // less the 4 bytes of the first store
.endif
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        add             x12, src, srcstride     // srcb
        add             x10, dst, dststride     // dstb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2    // flags are consumed by b.gt below; no NZCV writer in between

.ifc \type, qpel
        // 16-bit intermediate output: elements 0-3, then 4-5
        st1             {v23.4h}, [dst], #8
        st1             {v24.4h}, [x10], #8
        st1             {v23.s}[2], [dst], x14
        st1             {v24.s}[2], [x10], x14
.else
.ifc \type, qpel_bi
        // add the second prediction, then round, shift and narrow to u8
        sqadd           v23.8h, v23.8h, v25.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift and narrow to u8
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        // 8-bit output: pixels 0-3, then 4-5
        st1             {v23.s}[0], [dst], #4
        st1             {v24.s}[0], [x10], #4
        st1             {v23.h}[2], [dst], x14
        st1             {v24.h}[2], [x10], x14
.endif
        b.gt            0b                      // next pair of rows
        ret             mx
endfunc

// Horizontal 8-tap filter, 8 pixels wide, two rows per loop iteration.
// The width argument is not read. The return address is parked in the mx
// register because bl overwrites x30.
// Clobbers x10, x12-x15 (and x16 for qpel_bi), v0, v16-v21, v23-v26.
function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
        load_filter     mx
        mov             mx, x30                 // mx is dead after load_filter; reuse it for the return address
        sub             src, src, #3            // window starts 3 pixels left of the output position
        lsl             x13, srcstride, #1      // srcstridel
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        mov             x14, #(MAX_PB_SIZE << 2) // dststridel
.else
        lsl             x14, dststride, #1      // dststridel
.endif
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        add             x12, src, srcstride     // srcb
        add             x10, dst, dststride     // dstb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h8_8_neon
        subs            heightw, heightw, #2    // flags are consumed by b.gt below; no NZCV writer in between

.ifc \type, qpel
        // 16-bit intermediate output, 8 values per row
        st1             {v23.8h}, [dst], x14
        st1             {v24.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add the second prediction, then round, shift and narrow to u8
        sqadd           v23.8h, v23.8h, v25.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift and narrow to u8
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.8b}, [dst], x14
        st1             {v24.8b}, [x10], x14
.endif
        b.gt            0b                      // next pair of rows
        ret             mx
endfunc

.ifc \type, qpel
// Internal 8-tap kernel: two rows, 16 outputs each. Emitted once (by the qpel
// instantiation) and called with bl from the h12 and h16 entry points.
//   In:    v0.8h      = filter taps
//          v16-v18.8b = 24 source bytes of the first row
//          v19-v21.8b = 24 source bytes of the second row
//          x9         = remaining row count of the caller
//   Out:   v26.8h, v27.8h = 16-bit filter sums of the first row
//          v28.8h, v29.8h = 16-bit filter sums of the second row
//          x9 decremented by 2, NZCV set by that subtraction
//   Clobb: v16-v25
function ff_hevc_put_hevc_h16_8_neon, export=0
        // first row: widen to 16 bit and start both sums with tap 0
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        mul             v26.8h, v16.8h, v0.h[0]
        mul             v27.8h, v17.8h, v0.h[0]

        // second row
        uxtl            v19.8h, v19.8b
        uxtl            v20.8h, v20.8b
        uxtl            v21.8h, v21.8b
        mul             v28.8h, v19.8h, v0.h[0]
        mul             v29.8h, v20.8h, v0.h[0]

        // taps 1..7: slide the window one pixel (2 bytes) per tap
.irpc i, 1234567
        ext             v22.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v17.16b, v18.16b, #(2*\i)
        mla             v26.8h, v22.8h, v0.h[\i]
        mla             v27.8h, v23.8h, v0.h[\i]

        ext             v24.16b, v19.16b, v20.16b, #(2*\i)
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
        mla             v28.8h, v24.8h, v0.h[\i]
        mla             v29.8h, v25.8h, v0.h[\i]
.endr
        subs            x9, x9, #2              // row counter for the caller; it branches on these flags
        ret
endfunc
.endif

// Horizontal 8-tap filter working in column strips of 12 pixels.
// Inner loop (label 1): two rows per iteration over the whole height, using
// the h16 kernel and storing only the first 12 outputs of each row.
// Outer loop (label 0): rewind all row pointers, step 12 columns to the
// right and repeat while the remaining width is positive.
// For qpel_bi the width is the ninth argument and is read from the stack.
// The return address is parked in the mx register because bl overwrites x30.
// Clobbers x9, x10, x12-x15 (and x16, x17 for qpel_bi), v0, v16-v29.
function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw         // 64-bit height for the msub rewinds below
.ifc \type, qpel_bi
        ldr             width, [sp]             // width is a 32-bit int: load all 32 bits
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        lsl             x17, height, #7         // src2b reset (height * (MAX_PB_SIZE << 1))
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3            // window starts 3 pixels left of the output position
        mov             mx, x30                 // mx is dead after load_filter; reuse it for the return address
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1      // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16) // dststridel less the 16 bytes of the first store
.else
        lsl             x14, dststride, #1      // dststridel
        lsl             x13, srcstride, #1      // srcstridel
        sub             x14, x14, #8            // less the 8 bytes of the first store
.endif
        add             x10, dst, dststride     // dstb
        add             x12, src, srcstride     // srcb
0:      mov             x9, height              // row counter, decremented inside the kernel
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        bl              ff_hevc_put_hevc_h16_8_neon // also does subs x9, x9, #2

.ifc \type, qpel
        // 16-bit intermediate output: elements 0-7, then 8-11
        st1             {v26.8h}, [dst], #16
        st1             {v28.8h}, [x10], #16
        st1             {v27.4h}, [dst], x14
        st1             {v29.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        // the kernel clobbers v16-v25, so src2 is loaded after the call
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        // 8-bit output: pixels 0-7, then 8-11
        st1             {v26.8b}, [dst], #8
        st1             {v28.8b}, [x10], #8
        st1             {v27.s}[0], [dst], x14
        st1             {v29.s}[0], [x10], x14
.endif
        b.gt            1b                      // double line; flags come from the subs in the kernel
        subs            width, width, #12       // flags consumed by b.gt 0b; nothing below writes NZCV
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc, then advance 12 int16 columns
        sub             x4, x4, x17
        sub             x15, x15, x17
        add             x4, x4, #24
        add             x15, x15, #24
.endif
        // advance 12 source pixels
        add             src, src, #12
        add             x12, x12, #12
.ifc \type, qpel
        // 12 int16 outputs
        add             dst, dst, #24
        add             x10, x10, #24
.else
        // 12 u8 outputs
        add             dst, dst, #12
        add             x10, x10, #12
.endif
        b.gt            0b
        ret             mx
endfunc

// Horizontal 8-tap filter working in column strips of 16 pixels.
// Inner loop (label 1): two rows per iteration over the whole height, using
// the h16 kernel.
// Outer loop (label 0): rewind all row pointers, step 16 columns to the
// right and repeat while the remaining width is positive.
// For qpel_bi the width is the ninth argument and is read from the stack.
// The return address is parked in the mx register because bl overwrites x30.
// Clobbers x9, x10, x12-x15 (and x16, x17 for qpel_bi), v0, v16-v29.
function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
        load_filter     mx
        sxtw            height, heightw         // 64-bit height for the msub rewinds below
.ifc \type, qpel_bi
        ldr             width, [sp]             // width is a 32-bit int: load all 32 bits
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        lsl             x17, height, #7         // src2b reset (height * (MAX_PB_SIZE << 1))
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3            // window starts 3 pixels left of the output position
        mov             mx, x30                 // mx is dead after load_filter; reuse it for the return address
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1      // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16) // dststridel less the 16 bytes of the first store
.else
        lsl             x14, dststride, #1      // dststridel
        lsl             x13, srcstride, #1      // srcstridel
        sub             x14, x14, #8            // less the 8 bytes of the first store
.endif
        add             x10, dst, dststride     // dstb
        add             x12, src, srcstride     // srcb
0:      mov             x9, height              // row counter, decremented inside the kernel
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        bl              ff_hevc_put_hevc_h16_8_neon // also does subs x9, x9, #2

.ifc \type, qpel
        // 16-bit intermediate output: elements 0-7, then 8-15
        st1             {v26.8h}, [dst], #16
        st1             {v28.8h}, [x10], #16
        st1             {v27.8h}, [dst], x14
        st1             {v29.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        // the kernel clobbers v16-v25, so src2 is loaded after the call
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        // 8-bit output: pixels 0-7, then 8-15
        st1             {v26.8b}, [dst], #8
        st1             {v28.8b}, [x10], #8
        st1             {v27.8b}, [dst], x14
        st1             {v29.8b}, [x10], x14
.endif
        b.gt            1b                      // double line; flags come from the subs in the kernel
        subs            width, width, #16       // flags consumed by b.gt 0b; nothing below writes NZCV
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc, then advance 16 int16 columns
        sub             x4, x4, x17
        sub             x15, x15, x17
        add             x4, x4, #32
        add             x15, x15, #32
.endif
        // advance 16 source pixels
        add             src, src, #16
        add             x12, x12, #16
.ifc \type, qpel
        // 16 int16 outputs
        add             dst, dst, #32
        add             x10, x10, #32
.else
        // 16 u8 outputs
        add             dst, dst, #16
        add             x10, x10, #16
.endif
        b.gt            0b
        ret             mx
endfunc

// Release the per-variant register aliases so the next instantiation can
// bind the same names to different registers.
.unreq dst
.unreq dststride
.unreq src
.unreq srcstride
.unreq height
.unreq heightw
.unreq mx
.unreq width
.endm

// One instantiation per variant. The shared internal kernels
// (ff_hevc_put_hevc_h4/h8/h16_8_neon) are emitted by the qpel one only.
put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi