You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

485 lines
16 KiB

lavc/aarch64: add hevc horizontal qpel/uni/bi checkasm --benchmark on Ampere Altra (Neoverse N1): put_hevc_qpel_bi_h4_8_c: 170.7 put_hevc_qpel_bi_h4_8_neon: 64.5 put_hevc_qpel_bi_h6_8_c: 373.7 put_hevc_qpel_bi_h6_8_neon: 130.2 put_hevc_qpel_bi_h8_8_c: 662.0 put_hevc_qpel_bi_h8_8_neon: 138.5 put_hevc_qpel_bi_h12_8_c: 1529.5 put_hevc_qpel_bi_h12_8_neon: 422.0 put_hevc_qpel_bi_h16_8_c: 2735.5 put_hevc_qpel_bi_h16_8_neon: 560.5 put_hevc_qpel_bi_h24_8_c: 6015.7 put_hevc_qpel_bi_h24_8_neon: 1636.0 put_hevc_qpel_bi_h32_8_c: 10779.0 put_hevc_qpel_bi_h32_8_neon: 2204.5 put_hevc_qpel_bi_h48_8_c: 24375.0 put_hevc_qpel_bi_h48_8_neon: 4984.0 put_hevc_qpel_bi_h64_8_c: 42768.0 put_hevc_qpel_bi_h64_8_neon: 8795.7 put_hevc_qpel_h4_8_c: 149.0 put_hevc_qpel_h4_8_neon: 55.7 put_hevc_qpel_h6_8_c: 321.2 put_hevc_qpel_h6_8_neon: 106.0 put_hevc_qpel_h8_8_c: 578.7 put_hevc_qpel_h8_8_neon: 133.2 put_hevc_qpel_h12_8_c: 1279.0 put_hevc_qpel_h12_8_neon: 391.7 put_hevc_qpel_h16_8_c: 2286.2 put_hevc_qpel_h16_8_neon: 519.7 put_hevc_qpel_h24_8_c: 5100.7 put_hevc_qpel_h24_8_neon: 1546.2 put_hevc_qpel_h32_8_c: 9022.0 put_hevc_qpel_h32_8_neon: 2060.2 put_hevc_qpel_h48_8_c: 20293.5 put_hevc_qpel_h48_8_neon: 4656.7 put_hevc_qpel_h64_8_c: 36037.0 put_hevc_qpel_h64_8_neon: 8262.7 put_hevc_qpel_uni_h4_8_c: 162.2 put_hevc_qpel_uni_h4_8_neon: 61.7 put_hevc_qpel_uni_h6_8_c: 355.2 put_hevc_qpel_uni_h6_8_neon: 114.2 put_hevc_qpel_uni_h8_8_c: 651.0 put_hevc_qpel_uni_h8_8_neon: 135.7 put_hevc_qpel_uni_h12_8_c: 1412.5 put_hevc_qpel_uni_h12_8_neon: 402.7 put_hevc_qpel_uni_h16_8_c: 2551.0 put_hevc_qpel_uni_h16_8_neon: 533.5 put_hevc_qpel_uni_h24_8_c: 5782.2 put_hevc_qpel_uni_h24_8_neon: 1578.7 put_hevc_qpel_uni_h32_8_c: 10586.5 put_hevc_qpel_uni_h32_8_neon: 2102.2 put_hevc_qpel_uni_h48_8_c: 23812.0 put_hevc_qpel_uni_h48_8_neon: 4739.5 put_hevc_qpel_uni_h64_8_c: 42958.7 put_hevc_qpel_uni_h64_8_neon: 8366.5 Signed-off-by: J. Dekker <jdek@itanimul.li>
2 years ago
/* -*-arm64-*-
* vim: syntax=arm64asm
*
* Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#define MAX_PB_SIZE 64
const qpel_filters, align=4
.byte 0, 0, 0, 0, 0, 0, 0, 0
.byte -1, 4,-10, 58, 17, -5, 1, 0
.byte -1, 4,-11, 40, 40,-11, 4, -1
.byte 0, 1, -5, 17, 58,-10, 4, -1
endconst
.macro load_filter m
movrel x15, qpel_filters
add x15, x15, \m, lsl #3
ld1 {v0.8b}, [x15]
sxtl v0.8h, v0.8b
.endm
.macro put_hevc type
.ifc \type, qpel
// void put_hevc_qpel_h(int16_t *dst,
// uint8_t *_src, ptrdiff_t _srcstride,
// int height, intptr_t mx, intptr_t my, int width)
dst .req x0
dststride .req x7
src .req x1
srcstride .req x2
height .req x3
heightw .req w3
mx .req x4
width .req w6
.endif
.ifc \type, qpel_uni
// void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
// uint8_t *_src, ptrdiff_t _srcstride,
// int height, intptr_t mx, intptr_t my, int width)
dst .req x0
dststride .req x1
src .req x2
srcstride .req x3
height .req x4
heightw .req w4
mx .req x5
width .req w7
.endif
.ifc \type, qpel_bi
// void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
// uint8_t *_src, ptrdiff_t _srcstride,
// int16_t *src2, int height, intptr_t mx,
// intptr_t my, int width)
dst .req x0
dststride .req x1
src .req x2
srcstride .req x3
height .req x5
heightw .req w5
mx .req x6
width .req w8
.endif
.ifc \type, qpel
function ff_hevc_put_hevc_h4_8_neon, export=0
uxtl v16.8h, v16.8b
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
mul v23.4h, v16.4h, v0.h[0]
mul v24.4h, v18.4h, v0.h[0]
.irpc i, 1234567
ext v20.16b, v16.16b, v17.16b, #(2*\i)
ext v21.16b, v18.16b, v19.16b, #(2*\i)
mla v23.4h, v20.4h, v0.h[\i]
mla v24.4h, v21.4h, v0.h[\i]
.endr
ret
endfunc
.endif
function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
load_filter mx
.ifc \type, qpel_bi
mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
add x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
sub src, src, #3
mov mx, x30
.ifc \type, qpel
mov dststride, #(MAX_PB_SIZE << 1)
lsl x13, srcstride, #1 // srcstridel
mov x14, #(MAX_PB_SIZE << 2)
.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
.endif
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
0: ld1 {v16.8b, v17.8b}, [src], x13
ld1 {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
ld1 {v25.8h}, [ x4], x16
ld1 {v26.8h}, [x15], x16
.endif
bl ff_hevc_put_hevc_h4_8_neon
subs heightw, heightw, #2
.ifc \type, qpel
st1 {v23.4h}, [dst], x14
st1 {v24.4h}, [x10], x14
.else
.ifc \type, qpel_bi
sqadd v23.4h, v23.4h, v25.4h
sqadd v24.4h, v24.4h, v26.4h
sqrshrun v23.8b, v23.8h, #7
sqrshrun v24.8b, v24.8h, #7
.else
sqrshrun v23.8b, v23.8h, #6
sqrshrun v24.8b, v24.8h, #6
.endif
st1 {v23.s}[0], [dst], x14
st1 {v24.s}[0], [x10], x14
.endif
b.gt 0b // double line
ret mx
endfunc
.ifc \type, qpel
function ff_hevc_put_hevc_h8_8_neon, export=0
uxtl v16.8h, v16.8b
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
mul v23.8h, v16.8h, v0.h[0]
mul v24.8h, v18.8h, v0.h[0]
.irpc i, 1234567
ext v20.16b, v16.16b, v17.16b, #(2*\i)
ext v21.16b, v18.16b, v19.16b, #(2*\i)
mla v23.8h, v20.8h, v0.h[\i]
mla v24.8h, v21.8h, v0.h[\i]
.endr
ret
endfunc
.endif
function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
load_filter mx
.ifc \type, qpel_bi
mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
add x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
sub src, src, #3
mov mx, x30
.ifc \type, qpel
mov dststride, #(MAX_PB_SIZE << 1)
lsl x13, srcstride, #1 // srcstridel
mov x14, #((MAX_PB_SIZE << 2) - 8)
.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
sub x14, x14, #4
.endif
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
0: ld1 {v16.8b, v17.8b}, [src], x13
ld1 {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
ld1 {v25.8h}, [ x4], x16
ld1 {v26.8h}, [x15], x16
.endif
bl ff_hevc_put_hevc_h8_8_neon
subs heightw, heightw, #2
.ifc \type, qpel
st1 {v23.4h}, [dst], #8
st1 {v24.4h}, [x10], #8
st1 {v23.s}[2], [dst], x14
st1 {v24.s}[2], [x10], x14
.else
.ifc \type, qpel_bi
sqadd v23.8h, v23.8h, v25.8h
sqadd v24.8h, v24.8h, v26.8h
sqrshrun v23.8b, v23.8h, #7
sqrshrun v24.8b, v24.8h, #7
.else
sqrshrun v23.8b, v23.8h, #6
sqrshrun v24.8b, v24.8h, #6
.endif
st1 {v23.s}[0], [dst], #4
st1 {v24.s}[0], [x10], #4
st1 {v23.h}[2], [dst], x14
st1 {v24.h}[2], [x10], x14
.endif
b.gt 0b // double line
ret mx
endfunc
function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
load_filter mx
.ifc \type, qpel_bi
mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
add x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
sub src, src, #3
mov mx, x30
.ifc \type, qpel
mov dststride, #(MAX_PB_SIZE << 1)
lsl x13, srcstride, #1 // srcstridel
mov x14, #(MAX_PB_SIZE << 2)
.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
.endif
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
0: ld1 {v16.8b, v17.8b}, [src], x13
ld1 {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
ld1 {v25.8h}, [ x4], x16
ld1 {v26.8h}, [x15], x16
.endif
bl ff_hevc_put_hevc_h8_8_neon
subs heightw, heightw, #2
.ifc \type, qpel
st1 {v23.8h}, [dst], x14
st1 {v24.8h}, [x10], x14
.else
.ifc \type, qpel_bi
sqadd v23.8h, v23.8h, v25.8h
sqadd v24.8h, v24.8h, v26.8h
sqrshrun v23.8b, v23.8h, #7
sqrshrun v24.8b, v24.8h, #7
.else
sqrshrun v23.8b, v23.8h, #6
sqrshrun v24.8b, v24.8h, #6
.endif
st1 {v23.8b}, [dst], x14
st1 {v24.8b}, [x10], x14
.endif
b.gt 0b // double line
ret mx
endfunc
.ifc \type, qpel
function ff_hevc_put_hevc_h16_8_neon, export=0
uxtl v16.8h, v16.8b
uxtl v17.8h, v17.8b
uxtl v18.8h, v18.8b
uxtl v19.8h, v19.8b
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
mul v26.8h, v16.8h, v0.h[0]
mul v27.8h, v17.8h, v0.h[0]
mul v28.8h, v19.8h, v0.h[0]
mul v29.8h, v20.8h, v0.h[0]
.irpc i, 1234567
ext v22.16b, v16.16b, v17.16b, #(2*\i)
ext v23.16b, v17.16b, v18.16b, #(2*\i)
ext v24.16b, v19.16b, v20.16b, #(2*\i)
ext v25.16b, v20.16b, v21.16b, #(2*\i)
mla v26.8h, v22.8h, v0.h[\i]
mla v27.8h, v23.8h, v0.h[\i]
mla v28.8h, v24.8h, v0.h[\i]
mla v29.8h, v25.8h, v0.h[\i]
.endr
subs x9, x9, #2
ret
endfunc
.endif
function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
load_filter mx
sxtw height, heightw
.ifc \type, qpel_bi
ldrh w8, [sp] // width
mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
lsl x17, height, #7 // src2b reset (height * (MAX_PB_SIZE << 1))
add x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
sub src, src, #3
mov mx, x30
.ifc \type, qpel
mov dststride, #(MAX_PB_SIZE << 1)
lsl x13, srcstride, #1 // srcstridel
mov x14, #((MAX_PB_SIZE << 2) - 16)
.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
sub x14, x14, #8
.endif
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
0: mov x9, height
1: ld1 {v16.8b-v18.8b}, [src], x13
ld1 {v19.8b-v21.8b}, [x12], x13
bl ff_hevc_put_hevc_h16_8_neon
.ifc \type, qpel
st1 {v26.8h}, [dst], #16
st1 {v28.8h}, [x10], #16
st1 {v27.4h}, [dst], x14
st1 {v29.4h}, [x10], x14
.else
.ifc \type, qpel_bi
ld1 {v16.8h, v17.8h}, [ x4], x16
ld1 {v18.8h, v19.8h}, [x15], x16
sqadd v26.8h, v26.8h, v16.8h
sqadd v27.8h, v27.8h, v17.8h
sqadd v28.8h, v28.8h, v18.8h
sqadd v29.8h, v29.8h, v19.8h
sqrshrun v26.8b, v26.8h, #7
sqrshrun v27.8b, v27.8h, #7
sqrshrun v28.8b, v28.8h, #7
sqrshrun v29.8b, v29.8h, #7
.else
sqrshrun v26.8b, v26.8h, #6
sqrshrun v27.8b, v27.8h, #6
sqrshrun v28.8b, v28.8h, #6
sqrshrun v29.8b, v29.8h, #6
.endif
st1 {v26.8b}, [dst], #8
st1 {v28.8b}, [x10], #8
st1 {v27.s}[0], [dst], x14
st1 {v29.s}[0], [x10], x14
.endif
b.gt 1b // double line
subs width, width, #12
// reset src
msub src, srcstride, height, src
msub x12, srcstride, height, x12
// reset dst
msub dst, dststride, height, dst
msub x10, dststride, height, x10
.ifc \type, qpel_bi
// reset xsrc
sub x4, x4, x17
sub x15, x15, x17
add x4, x4, #24
add x15, x15, #24
.endif
add src, src, #12
add x12, x12, #12
.ifc \type, qpel
add dst, dst, #24
add x10, x10, #24
.else
add dst, dst, #12
add x10, x10, #12
.endif
b.gt 0b
ret mx
endfunc
function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
load_filter mx
sxtw height, heightw
mov mx, x30
.ifc \type, qpel_bi
ldrh w8, [sp] // width
mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
lsl x17, x5, #7 // src2b reset
add x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
sub src, src, #3
mov mx, x30
.ifc \type, qpel
mov dststride, #(MAX_PB_SIZE << 1)
lsl x13, srcstride, #1 // srcstridel
mov x14, #((MAX_PB_SIZE << 2) - 16)
.else
lsl x14, dststride, #1 // dststridel
lsl x13, srcstride, #1 // srcstridel
sub x14, x14, #8
.endif
add x10, dst, dststride // dstb
add x12, src, srcstride // srcb
0: mov x9, height
1: ld1 {v16.8b-v18.8b}, [src], x13
ld1 {v19.8b-v21.8b}, [x12], x13
bl ff_hevc_put_hevc_h16_8_neon
.ifc \type, qpel
st1 {v26.8h}, [dst], #16
st1 {v28.8h}, [x10], #16
st1 {v27.8h}, [dst], x14
st1 {v29.8h}, [x10], x14
.else
.ifc \type, qpel_bi
ld1 {v16.8h, v17.8h}, [ x4], x16
ld1 {v18.8h, v19.8h}, [x15], x16
sqadd v26.8h, v26.8h, v16.8h
sqadd v27.8h, v27.8h, v17.8h
sqadd v28.8h, v28.8h, v18.8h
sqadd v29.8h, v29.8h, v19.8h
sqrshrun v26.8b, v26.8h, #7
sqrshrun v27.8b, v27.8h, #7
sqrshrun v28.8b, v28.8h, #7
sqrshrun v29.8b, v29.8h, #7
.else
sqrshrun v26.8b, v26.8h, #6
sqrshrun v27.8b, v27.8h, #6
sqrshrun v28.8b, v28.8h, #6
sqrshrun v29.8b, v29.8h, #6
.endif
st1 {v26.8b}, [dst], #8
st1 {v28.8b}, [x10], #8
st1 {v27.8b}, [dst], x14
st1 {v29.8b}, [x10], x14
.endif
b.gt 1b // double line
subs width, width, #16
// reset src
msub src, srcstride, height, src
msub x12, srcstride, height, x12
// reset dst
msub dst, dststride, height, dst
msub x10, dststride, height, x10
.ifc \type, qpel_bi
// reset xsrc
sub x4, x4, x17
sub x15, x15, x17
add x4, x4, #32
add x15, x15, #32
.endif
add src, src, #16
add x12, x12, #16
.ifc \type, qpel
add dst, dst, #32
add x10, x10, #32
.else
add dst, dst, #16
add x10, x10, #16
.endif
b.gt 0b
ret mx
endfunc
.unreq height
.unreq heightw
.unreq width
.unreq src
.unreq dst
.unreq srcstride
.unreq dststride
.unreq mx
.endm
put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi