/* Retrieved from a mirror of https://github.com/FFmpeg/FFmpeg.git (829 lines, 30 KiB). */
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

/*
 * h264_loop_filter_start - shared prologue of the non-intra loop filters.
 *
 * In:    w2 = alpha, w3 = beta,
 *        x4 = pointer to four 8-bit clip values (presumably the tc0[]
 *             table of the C prototype -- confirm against the caller)
 * Out:   w6 and v24.s[0] = the four clip bytes packed into one word
 * Clobb: w8, NZCV
 *
 * Executes `ret` -- i.e. returns from the *enclosing* function -- when
 * alpha == 0, when beta == 0, or when all four clip bytes have their sign
 * bit set.  Otherwise execution continues after label 2.
 */
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // If alpha != 0 test beta, else force Z=1 (NZCV = #4) so that the
        // b.eq below also returns for alpha == 0.  (With NZCV = #0 the
        // alpha == 0 case fell through into the filter, which then found an
        // empty mask and stored nothing -- same result, wasted work.)
        ccmp            w3,  #0,  #4,  ne
        mov             v24.s[0], w6
        and             w8,  w6,  w6,  lsl #16  // does not touch the flags
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8   // bit 31 = AND of the four sign bits
        b.ge            2f                      // at least one clip byte is >= 0
1:
        ret
2:
.endm

/*
 * h264_loop_filter_luma - normal (non-intra) luma edge filter, 16 pixels wide.
 *
 * In:    v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
 *        w2  = alpha, w3 = beta
 *        v24.s[0] = four clip bytes (as left by h264_loop_filter_start)
 * Out:   v17 = p1', v16 = p0', v0 = q0', v19 = q1'
 * Clobb: x7, v4, v20, v21, v22, v23, v24, v28, v30
 *
 * Branches to local label 9 (forward, supplied by the user of the macro)
 * before writing any output register when no lane passes the
 * alpha/beta/clip tests.
 */
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        // uxtl/uxtl/sli/sli: replicate each of the 4 clip bytes over 4 lanes
        uxtl            v24.8h,  v24.8b
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes with negative clip value
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // v21 = per-lane filter mask
        shrn            v30.8b,  v21.8h,  #4            // squeeze the mask into 64 bits
        mov             x7,  v30.d[0]
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        cbz             x7,  9f                         // nothing to filter
        and             v17.16b, v17.16b, v21.16b
        and             v19.16b, v19.16b, v21.16b
        and             v24.16b, v24.16b, v21.16b
        urhadd          v28.16b, v16.16b, v0.16b
        sub             v21.16b, v24.16b, v17.16b
        uqadd           v23.16b, v18.16b, v24.16b
        uhadd           v20.16b, v20.16b, v28.16b
        sub             v21.16b, v21.16b, v19.16b
        uhadd           v28.16b, v4.16b,  v28.16b
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b
        uqadd           v4.16b,  v2.16b,  v24.16b
        umax            v23.16b, v23.16b, v22.16b
        uqsub           v22.16b, v2.16b,  v24.16b
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b
        umax            v28.16b, v28.16b, v22.16b
        uxtl2           v20.8h,  v0.16b
        usubw           v4.8h,   v4.8h,   v16.8b
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // p1' where selected, else p1
        bsl             v19.16b, v28.16b, v2.16b        // q1' where selected, else q1
        neg             v23.16b, v21.16b
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b       // clamp delta to [-v21, v21]
        uxtl2           v21.8h,  v16.16b
        smax            v4.16b,  v4.16b,  v23.16b
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm

/*
 * ff_h264_v_loop_filter_luma_neon
 *
 * Filters a horizontal luma edge (pixels above/below the edge), 16 wide.
 *
 * In:    x0 = pointer to the first row below the edge (q0 row)
 *        w1 = row stride in bytes (signed 32-bit, sign-extended here)
 *        w2 = alpha, w3 = beta, x4 = pointer to four clip bytes
 * Clobb: x0, x1, x7, w6, w8, v0, v2, v4, v16-v24, v28, v30, NZCV
 *
 * Rows p1, p0, q0, q1 are rewritten; nothing is stored when the filter
 * mask is empty or the prologue returns early.
 */
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v2.16b},  [x0], x1             // q1
        ld1             {v4.16b},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2            // back 4 + 2 = 6 rows:
        sub             x0,  x0,  x1, lsl #1            // x0 -> p2 row
        ld1             {v20.16b}, [x0], x1             // p2
        ld1             {v18.16b}, [x0], x1             // p1
        ld1             {v16.16b}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // x0 -> p1 row
        st1             {v17.16b}, [x0], x1             // p1'
        st1             {v16.16b}, [x0], x1             // p0'
        st1             {v0.16b},  [x0], x1             // q0'
        st1             {v19.16b}, [x0]                 // q1'
9:
        ret
endfunc

/*
 * ff_h264_h_loop_filter_luma_neon
 *
 * Filters a vertical luma edge (pixels left/right of the edge), 16 rows.
 *
 * In:    x0 = pointer to the first pixel right of the edge in row 0
 *        w1 = row stride in bytes (signed 32-bit, sign-extended here)
 *        w2 = alpha, w3 = beta, x4 = pointer to four clip bytes
 * Clobb: x0, x1, x7, w6, w8, v0, v2, v4, v6, v16-v28, v30, NZCV
 *
 * Eight bytes (x0 - 4 .. x0 + 3) are read from each of 16 rows and
 * transposed so the shared luma macro can be used; four bytes per row
 * (x0 - 2 .. x0 + 1) are written back.  Nothing is stored when the filter
 * mask is empty or the prologue returns early.
 */
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #4
        // rows 0-7 into the low halves
        ld1             {v6.8b},   [x0], x1
        ld1             {v20.8b},  [x0], x1
        ld1             {v18.8b},  [x0], x1
        ld1             {v16.8b},  [x0], x1
        ld1             {v0.8b},   [x0], x1
        ld1             {v2.8b},   [x0], x1
        ld1             {v4.8b},   [x0], x1
        ld1             {v26.8b},  [x0], x1
        // rows 8-15 into the high halves
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows read above
        add             x0,  x0,  #2                    // -4 + 2: start at column p1
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1
9:
        ret
endfunc

/*
 * h264_loop_filter_start_intra - shared prologue of the intra loop filters.
 *
 * In:    w1 = row stride (signed 32-bit), w2 = alpha, w3 = beta
 * Out:   x1 = sign-extended stride
 *        v30 = alpha splatted to 16 bytes, v31 = beta splatted to 16 bytes
 * Clobb: w4
 *
 * Executes `ret` -- i.e. returns from the *enclosing* function -- only when
 * alpha and beta are both zero.
 */
.macro  h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        sxtw            x1,  w1
        dup             v30.16b, w2                     // alpha
        dup             v31.16b, w3                     // beta
.endm

/*
 * h264_loop_filter_luma_intra - intra (strong) luma edge filter, 16 wide.
 *
 * In:    v4 = p3, v5 = p2, v6 = p1, v7 = p0
 *        v0 = q0, v1 = q1, v2 = q2, v3 = q3
 *        v30 = alpha (splatted), v31 = beta (splatted)
 * Out:   v5, v6, v7 (p2', p1', p0') and v0, v1, v2 (q0', q1', q2') updated
 *        in the lanes selected by the masks; v3 and v4 are left untouched.
 * Clobb: x4, v16-v31 (note: v30/v31 no longer hold alpha/beta afterwards)
 *
 * Masks, using the names from the inline comments:
 *   if_1 = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
 *   if_2 = |p0-q0| < (alpha >> 2) + 2
 *   if_3 = |p2-p0| < beta,  if_4 = |q2-q0| < beta
 *
 * Branches to local label 9 (forward, supplied by the user of the macro)
 * before modifying any pixel register when if_1 is false for every lane.
 */
.macro  h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // v19 = if_1
        shrn            v20.8b,  v19.8h,  #4            // squeeze the mask into 64 bits
        mov             x4,  v20.d[0]
        cbz             x4,  9f                         // nothing to filter

        // v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1 (16-bit lanes)
        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b
        uaddw           v22.8h,  v22.8h,  v6.8b
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2            // p0'_1
        rshrn           v25.8b,  v22.8h,  #2            // q0'_1
        rshrn2          v24.16b, v21.8h,  #2            // p0'_1
        rshrn2          v25.16b, v23.8h,  #2            // q0'_1

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

        //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3            // p0'_2
        rshrn2          v20.16b, v21.8h,  #3            // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2            // p1'_2
        rshrn2          v21.16b, v27.8h,  #2            // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3            // p2'_2
        rshrn2          v19.16b, v29.8h,  #3            // p2'_2

        //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3            // q0'_2
        rshrn2          v22.16b, v23.8h,  #3            // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2            // q1'_2
        rshrn2          v23.16b, v27.8h,  #2            // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3            // q2'_2
        rshrn2          v26.16b, v29.8h,  #3            // q2'_2

        // The _1 and _2 masks are mutually exclusive per lane and side.
        bit             v7.16b,  v24.16b, v30.16b       // p0'_1
        bit             v0.16b,  v25.16b, v31.16b       // q0'_1
        bit             v7.16b,  v20.16b, v17.16b       // p0'_2
        bit             v6.16b,  v21.16b, v17.16b       // p1'_2
        bit             v5.16b,  v19.16b, v17.16b       // p2'_2
        bit             v0.16b,  v22.16b, v18.16b       // q0'_2
        bit             v1.16b,  v23.16b, v18.16b       // q1'_2
        bit             v2.16b,  v26.16b, v18.16b       // q2'_2
.endm

/*
 * ff_h264_v_loop_filter_luma_intra_neon
 *
 * Intra filter for a horizontal luma edge, 16 pixels wide.
 *
 * In:    x0 = pointer to the first row below the edge (q0 row)
 *        w1 = row stride in bytes (signed 32-bit)
 *        w2 = alpha, w3 = beta
 * Clobb: x0, x1, x4, v0-v7, v16-v31
 *
 * Reads rows p3..q3 and rewrites rows p2..q2.  Nothing is stored when the
 * filter mask is empty or the prologue returns early.
 */
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v1.16b},  [x0], x1             // q1
        ld1             {v2.16b},  [x0], x1             // q2
        ld1             {v3.16b},  [x0], x1             // q3
        sub             x0,  x0,  x1, lsl #3            // back 8 rows: x0 -> p3 row
        ld1             {v4.16b},  [x0], x1             // p3
        ld1             {v5.16b},  [x0], x1             // p2
        ld1             {v6.16b},  [x0], x1             // p1
        ld1             {v7.16b},  [x0]                 // p0 (no post-increment)

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1            // x0 -> p2 row
        st1             {v5.16b},  [x0], x1             // p2
        st1             {v6.16b},  [x0], x1             // p1
        st1             {v7.16b},  [x0], x1             // p0
        st1             {v0.16b},  [x0], x1             // q0
        st1             {v1.16b},  [x0], x1             // q1
        st1             {v2.16b},  [x0]                 // q2
9:
        ret
endfunc

/*
 * ff_h264_h_loop_filter_luma_intra_neon
 *
 * Intra filter for a vertical luma edge, 16 rows.
 *
 * In:    x0 = pointer to the first pixel right of the edge in row 0
 *        w1 = row stride in bytes (signed 32-bit)
 *        w2 = alpha, w3 = beta
 * Clobb: x0, x1, x4, v0-v7, v16-v31
 *
 * Eight bytes (x0 - 4 .. x0 + 3) of each of the 16 rows are read,
 * transposed, filtered, transposed back and written in full.  Nothing is
 * stored when the filter mask is empty or the prologue returns early.
 */
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4
        // rows 0-7 into the low halves
        ld1             {v4.8b},  [x0], x1
        ld1             {v5.8b},  [x0], x1
        ld1             {v6.8b},  [x0], x1
        ld1             {v7.8b},  [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v1.8b},  [x0], x1
        ld1             {v2.8b},  [x0], x1
        ld1             {v3.8b},  [x0], x1
        // rows 8-15 into the high halves
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows read above
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x0], x1
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:
        ret
endfunc

/*
 * h264_loop_filter_chroma - normal (non-intra) chroma edge filter, 8 wide.
 *
 * In:    v18 = p1, v16 = p0, v0 = q0, v2 = q1 (low 8 bytes of each)
 *        w2  = alpha, w3 = beta
 *        v24.s[0] = four clip bytes (as left by h264_loop_filter_start)
 * Out:   v16 = p0', v0 = q0'
 * Clobb: x8, v4, v22, v23, v24, v25, v26, v28, v30
 *
 * Branches to local label 9 (forward, supplied by the user of the macro)
 * before writing v16/v0 when no lane passes the alpha/beta tests.
 */
.macro  h264_loop_filter_chroma
        dup             v22.8b,  w2                     // alpha
        dup             v23.8b,  w3                     // beta
        // uxtl + sli: replicate each of the 4 clip bytes over 2 lanes
        uxtl            v24.8h,  v24.8b
        uabd            v26.8b,  v16.8b,  v0.8b         // abs(p0 - q0)
        uabd            v28.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v30.8b,  v2.8b,   v0.8b         // abs(q1 - q0)
        cmhi            v26.8b,  v22.8b,  v26.8b        // < alpha
        cmhi            v28.8b,  v23.8b,  v28.8b        // < beta
        cmhi            v30.8b,  v23.8b,  v30.8b        // < beta
        uxtl            v4.8h,   v0.8b
        and             v26.8b,  v26.8b,  v28.8b
        usubw           v4.8h,   v4.8h,   v16.8b
        and             v26.8b,  v26.8b,  v30.8b        // v26 = per-lane filter mask
        shl             v4.8h,   v4.8h,   #2
        mov             x8,  v26.d[0]
        sli             v24.8h,  v24.8h,  #8
        uaddw           v4.8h,   v4.8h,   v18.8b
        cbz             x8,  9f                         // nothing to filter
        usubw           v4.8h,   v4.8h,   v2.8b
        rshrn           v4.8b,   v4.8h,   #3            // ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
        smin            v4.8b,   v4.8b,   v24.8b        // clamp delta to [-clip, clip]
        neg             v25.8b,  v24.8b
        smax            v4.8b,   v4.8b,   v25.8b
        uxtl            v22.8h,  v0.8b
        and             v4.8b,   v4.8b,   v26.8b        // zero delta in unselected lanes
        uxtl            v28.8h,  v16.8b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        sqxtun          v16.8b,  v28.8h
        sqxtun          v0.8b,   v22.8h
.endm

/*
 * ff_h264_v_loop_filter_chroma_neon
 *
 * Filters a horizontal chroma edge, 8 pixels wide.
 *
 * In:    x0 = pointer to the first row below the edge (q0 row)
 *        w1 = row stride in bytes (signed 32-bit, sign-extended here)
 *        w2 = alpha, w3 = beta, x4 = pointer to four clip bytes
 * Clobb: x0, x1, x8, w6, v0, v2, v4, v16, v18, v22-v26, v28, v30, NZCV
 *
 * Rows p0 and q0 are rewritten; nothing is stored when the filter mask is
 * empty or the prologue returns early.
 */
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1            // x0 -> p1 row
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v0.8b},  [x0], x1              // q0
        ld1             {v2.8b},  [x0]                  // q1 (no post-increment)

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1            // x0 -> p0 row
        st1             {v16.8b}, [x0], x1              // p0'
        st1             {v0.8b},  [x0], x1              // q0'
9:
        ret
endfunc

/*
 * ff_h264_h_loop_filter_chroma_neon
 *
 * Filters a vertical chroma edge, 8 rows.
 *
 * In:    x0 = pointer to the first pixel right of the edge in row 0
 *        w1 = row stride in bytes (signed 32-bit, sign-extended here)
 *        w2 = alpha, w3 = beta, x4 = pointer to four clip bytes
 * Clobb: x0, x1, x8, w6, v0, v2, v4, v16, v18, v22-v26, v28-v31, NZCV
 *
 * Secondary entry point h_loop_filter_chroma420 (used by the 4:2:2
 * variant in this file) expects:
 *        x0 = pointer to column p1 of the first row (edge - 2)
 *        x1 = 64-bit stride, w2/w3 = alpha/beta, v24.s[0] = clip bytes
 * and returns through x30.
 *
 * Four bytes (p1, p0, q0, q1) per row are read and written back.  Nothing
 * is stored when the filter mask is empty or the prologue returns early.
 */
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #2
h_loop_filter_chroma420:
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3            // rewind the 8 rows read above
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
9:
        ret
endfunc

/*
 * ff_h264_h_loop_filter_chroma422_neon
 *
 * Filters a vertical chroma edge over 16 rows by running the 8-row
 * h_loop_filter_chroma420 body twice with a doubled stride: first on
 * rows 0, 2, ..., 14, then on rows 1, 3, ..., 15.
 *
 * In:    x0 = pointer to the first pixel right of the edge in row 0
 *        w1 = row stride in bytes (signed 32-bit, sign-extended here)
 *        w2 = alpha, w3 = beta, x4 = pointer to four clip bytes
 * Clobb: x0, x1, x5, x7, x8, w6, v0, v2, v4, v16, v18, v22-v26, v28-v31, NZCV
 */
function ff_h264_h_loop_filter_chroma422_neon, export=1
        sxtw            x1,  w1
        h264_loop_filter_start
        add             x5,  x0,  x1                    // x5 = row 1 (start of the odd rows)
        sub             x0,  x0,  #2
        add             x1,  x1,  x1                    // stride *= 2
        mov             x7,  x30                        // bl below overwrites x30
        bl              h_loop_filter_chroma420
        mov             x30, x7
        sub             x0,  x5,  #2
        mov             v24.s[0], w6                    // the first pass modified v24; reload clip bytes
        b               h_loop_filter_chroma420         // tail call, returns to our caller
endfunc

/*
 * h264_loop_filter_chroma_intra - intra chroma edge filter, 8 wide.
 *
 * In:    v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes of each)
 *        v30 = alpha (splatted), v31 = beta (splatted)
 * Out:   v16 = p0' = (2*p1 + p0 + q1 + 2) >> 2 in the selected lanes
 *        v17 = q0' = (2*q1 + q0 + p1 + 2) >> 2 in the selected lanes
 * Clobb: x2, v4, v6, v20, v22, v24, v25, v26, v27, v28
 *
 * Branches to local label 9 (forward, supplied by the user of the macro)
 * before writing v16/v17 when no lane passes the alpha/beta tests.
 */
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b        // v26 = per-lane filter mask
        mov             x2,  v26.d[0]

        ushll           v4.8h,   v18.8b,  #1            // 2 * p1
        ushll           v6.8h,   v19.8b,  #1            // 2 * q1
        cbz             x2,  9f                         // nothing to filter
        uaddl           v20.8h,  v16.8b,  v19.8b        // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b        // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h
        add             v22.8h,  v22.8h,  v6.8h
        uqrshrn         v24.8b,  v20.8h,  #2
        uqrshrn         v25.8b,  v22.8h,  #2
        bit             v16.8b,  v24.8b,  v26.8b
        bit             v17.8b,  v25.8b,  v26.8b
.endm

/*
 * ff_h264_v_loop_filter_chroma_intra_neon
 *
 * Intra filter for a horizontal chroma edge, 8 pixels wide.
 *
 * In:    x0 = pointer to the first row below the edge (q0 row)
 *        w1 = row stride in bytes (signed 32-bit)
 *        w2 = alpha, w3 = beta
 * Clobb: x0, x1, x2, x4, v4, v6, v16-v20, v22, v24-v28, v30, v31
 *
 * Rows p0 and q0 are rewritten; nothing is stored when the filter mask is
 * empty or the prologue returns early.
 */
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1            // x0 -> p1 row
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v17.8b}, [x0], x1              // q0
        ld1             {v19.8b}, [x0]                  // q1 (no post-increment)

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1            // x0 -> p0 row
        st1             {v16.8b}, [x0], x1              // p0'
        st1             {v17.8b}, [x0], x1              // q0'

9:
        ret
endfunc

/*
 * ff_h264_h_loop_filter_chroma_mbaff_intra_neon
 *
 * Intra filter for a vertical chroma edge, 4 rows.
 *
 * In:    x0 = pointer to the first pixel right of the edge in row 0
 *        w1 = row stride in bytes (signed 32-bit)
 *        w2 = alpha, w3 = beta
 * Clobb: x0, x1, x2, x4, v4, v6, v16-v20, v22, v24-v31
 *
 * Each of the 4 rows is loaded as 8 bytes starting at x0 - 2, but only the
 * two bytes at x0 - 1 and x0 (p0, q0) are written back.  Nothing is stored
 * when the filter mask is empty or the prologue returns early.
 */
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2                    // x4: load pointer (column p1)
        sub             x0,  x0,  #1                    // x0: store pointer (column p0)
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        // st2 interleaves lane i of v16/v17: writes the (p0', q0') pair of row i
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc

/*
 * ff_h264_h_loop_filter_chroma_intra_neon
 *
 * Intra filter for a vertical chroma edge, 8 rows.
 *
 * In:    x0 = pointer to the first pixel right of the edge in row 0
 *        w1 = row stride in bytes (signed 32-bit)
 *        w2 = alpha, w3 = beta
 * Clobb: x0, x1, x2, x4, v4, v6, v16-v20, v22, v24-v31
 *
 * Secondary entry point h_loop_filter_chroma420_intra (used by the 4:2:2
 * variant in this file) expects:
 *        x4 = load pointer  (edge - 2), x0 = store pointer (edge - 1)
 *        x1 = 64-bit stride, v30/v31 = splatted alpha/beta
 * and returns through x30.  Both x4 and x0 are advanced row by row, but
 * x0 only advances when the filter mask is non-empty.
 *
 * Only the two bytes (p0, q0) of each row are written back.  Nothing is
 * stored when the filter mask is empty or the prologue returns early.
 */
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2                    // x4: load pointer (column p1)
        sub             x0,  x0,  #1                    // x0: store pointer (column p0)
h_loop_filter_chroma420_intra:
        // rows 0-3: 8-byte loads (the high 4 bytes are replaced below)
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        // rows 4-7: 4 bytes each into the upper 32-bit lane
        ld1             {v18.s}[1], [x4], x1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        // st2 interleaves lane i of v16/v17: writes the (p0', q0') pair of row i
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc

/*
 * ff_h264_h_loop_filter_chroma422_intra_neon
 *
 * Intra filter for a vertical chroma edge over 16 rows, done as two
 * consecutive 8-row passes through h_loop_filter_chroma420_intra.
 *
 * In:    x0 = pointer to the first pixel right of the edge in row 0
 *        w1 = row stride in bytes (signed 32-bit)
 *        w2 = alpha, w3 = beta
 * Clobb: x0, x1, x2, x4, x5, x7, v4, v6, v16-v20, v22, v24-v31
 *
 * The load pointer x4 is left at row 8 by the first pass (its loads run
 * before the mask test), so only the store pointer x0 is re-derived.
 */
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2                    // x4: load pointer (column p1)
        add             x5,  x0,  x1, lsl #3            // x5 = row 8
        sub             x0,  x0,  #1                    // x0: store pointer (column p0)
        mov             x7,  x30                        // bl below overwrites x30
        bl              h_loop_filter_chroma420_intra
        sub             x0,  x5,  #1                    // store pointer for rows 8-15
        mov             x30, x7
        b               h_loop_filter_chroma420_intra   // tail call, returns to our caller
endfunc

/*
 * biweight_16 macs, macd - 16-pixel-wide bi-weighted prediction loop.
 *
 * \macd : widening multiply-accumulate (umlal or umlsl) applied to the
 *         rows read through x0 with the byte weight in w5
 * \macs : same, applied to the rows read through x1 with the weight in w6
 *
 * In:    x0 = first input (also the output, via x7), x1 = second input
 *        x2 = stride, w3 = row count (two rows per iteration)
 *        w5, w6 = weights (low byte used, treated as unsigned magnitudes)
 *        x7 = store pointer
 *        v16 = initial accumulator value (8 x 16-bit)
 *        v18 = per-lane shift count for sshl (negative = shift right)
 * Clobb: x0, x1, w3, x7, v0, v1, v4, v6, v20, v22, v24, v26, v28, v30, NZCV
 *
 * Ends with `ret`; the macro never falls through.
 */
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2                    // flags consumed by b.ne at the loop end
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,  v20.8b
        \macd\()2       v6.8h,   v0.16b, v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,  v22.8b
        \macs\()2       v6.8h,   v1.16b, v22.16b
        mov             v24.16b, v16.16b
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,  v28.8b
        \macd\()2       v26.8h,  v0.16b, v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,  v30.8b
        \macs\()2       v26.8h,  v1.16b, v30.16b
        sshl            v4.8h,   v4.8h,  v18.8h
        sshl            v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h                  // saturate to unsigned 8 bit
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h, v18.8h
        sshl            v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b                // re-arm accumulators for the next pass
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b
        ret
.endm

/*
 * biweight_8 macs, macd - 8-pixel-wide bi-weighted prediction loop.
 *
 * \macd : widening multiply-accumulate (umlal or umlsl) applied to the
 *         rows read through x0 with the byte weight in w5
 * \macs : same, applied to the rows read through x1 with the weight in w6
 *
 * In:    x0 = first input (also the output, via x7), x1 = second input
 *        x2 = stride, w3 = row count (two rows per iteration)
 *        w5, w6 = weights (low byte used, treated as unsigned magnitudes)
 *        x7 = store pointer
 *        v16 = initial accumulator value (8 x 16-bit)
 *        v18 = per-lane shift count for sshl (negative = shift right)
 * Clobb: x0, x1, w3, x7, v0, v1, v2, v4, v5, v6, v7, v20, NZCV
 *
 * Ends with `ret`; the macro never falls through.
 */
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2                    // flags consumed by b.ne at the loop end
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned 8 bit
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-arm accumulators for the next pass
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b
        ret
.endm

/*
 * biweight_4 macs, macd - 4-pixel-wide bi-weighted prediction loop.
 *
 * \macd : widening multiply-accumulate (umlal or umlsl) applied to the
 *         rows read through x0 with the byte weight in w5
 * \macs : same, applied to the rows read through x1 with the weight in w6
 *
 * In:    x0 = first input (also the output, via x7), x1 = second input
 *        x2 = stride, w3 = row count
 *        w5, w6 = weights (low byte used, treated as unsigned magnitudes)
 *        x7 = store pointer
 *        v16 = initial accumulator value (8 x 16-bit)
 *        v18 = per-lane shift count for sshl (negative = shift right)
 * Clobb: x0, x1, w3, x7, v0, v1, v2, v4, v5, v6, v7, v20, NZCV
 *
 * Two rows are packed per 64-bit vector.  An iteration handles four rows;
 * when fewer than four remain after the first pair has been loaded, the
 * tail at label 2 finishes that single pair.  Ends with `ret` on both
 * paths; the macro never falls through.
 */
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #4                    // flags consumed by b.lt and b.ne below
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        b.lt            2f                              // only one row pair left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned 8 bit
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-arm accumulators for the next pass
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm

/*
 * biweight_func w - emits ff_biweight_h264_pixels_<w>_neon.
 *
 * Register use of the generated function:
 *   x0 = first input and output, x1 = second input
 *   w2 = stride (signed 32-bit, sign-extended here), w3 = row count
 *   w4 = shift parameter, w5 = weight applied to x0 rows,
 *   w6 = weight applied to x1 rows, w7 = offset parameter
 *   (presumably dst, src, stride, height, log2_denom, weightd, weights,
 *    offset of the C prototype -- confirm against the caller)
 *
 * Set-up:
 *   v16 = ((w7 + 1) | 1) << w4     initial accumulator value
 *   v18 = ~w4 = -(w4 + 1)          sshl count, i.e. shift right by w4 + 1
 *   x7  = x0                       store pointer
 *
 * The multiply-accumulates are unsigned, so negative weights are negated
 * and the matching umlsl variant is selected.  w8 encodes the signs:
 *   0: w5 >= 0, w6 >= 0    1: w5 < 0, w6 >= 0
 *   2: w5 <  0, w6 <  0    3: w5 >= 0, w6 < 0
 * (w6 contributes bits 31:30, so this assumes those two bits agree.)
 *
 * Every biweight_<w> expansion ends in `ret`, so the labelled variants do
 * not fall through into each other.
 */
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8h,  w4
        lsl             w7,  w7,  w4
        not             v18.16b, v18.16b
        dup             v16.8h,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

// Instantiate ff_biweight_h264_pixels_{16,8,4}_neon.
        biweight_func   16
        biweight_func   8
        biweight_func   4

/*
 * weight_16 add - 16-pixel-wide weighted prediction loop.
 *
 * \add : vector op combining the offset term with the product, invoked as
 *        `\add dst, v16, product` (add/shadd, or sub/shsub for weights
 *        that were negated by the caller)
 *
 * In:    x0 = input rows, x5 = store pointer, x1 = stride
 *        w2 = row count (two rows per iteration)
 *        w4 = weight (low byte used, treated as an unsigned magnitude)
 *        v16 = offset term (8 x 16-bit)
 *        v18 = per-lane shift count for srshl (negative = rounding shift right)
 * Clobb: x0, w2, x5, v0, v4, v6, v20, v24, v26, v28, NZCV
 *
 * Ends with `ret`; the macro never falls through.
 */
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2                    // flags consumed by b.ne at the loop end
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,  v20.8b
        umull2          v6.8h,   v0.16b, v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,  v28.8b
        umull2          v26.8h,  v0.16b, v28.16b
        \add            v4.8h,   v16.8h, v4.8h
        srshl           v4.8h,   v4.8h,  v18.8h
        \add            v6.8h,   v16.8h, v6.8h
        srshl           v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h                  // saturate to unsigned 8 bit
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h, v24.8h
        srshl           v24.8h,  v24.8h, v18.8h
        \add            v26.8h,  v16.8h, v26.8h
        srshl           v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b
        ret
.endm

/*
 * weight_8 add - 8-pixel-wide weighted prediction loop.
 *
 * \add : vector op combining the offset term with the product, invoked as
 *        `\add dst, v16, product` (add/shadd, or sub/shsub for weights
 *        that were negated by the caller)
 *
 * In:    x0 = input rows, x5 = store pointer, x1 = stride
 *        w2 = row count (two rows per iteration)
 *        w4 = weight (low byte used, treated as an unsigned magnitude)
 *        v16 = offset term (8 x 16-bit)
 *        v18 = per-lane shift count for srshl (negative = rounding shift right)
 * Clobb: x0, w2, x5, v0, v2, v4, v6, v20, NZCV
 *
 * Ends with `ret`; the macro never falls through.
 */
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2                    // flags consumed by b.ne at the loop end
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned 8 bit
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b
        ret
.endm

/*
 * weight_4 add - 4-pixel-wide weighted prediction loop.
 *
 * \add : vector op combining the offset term with the product, invoked as
 *        `\add dst, v16, product` (add/shadd, or sub/shsub for weights
 *        that were negated by the caller)
 *
 * In:    x0 = input rows, x5 = store pointer, x1 = stride
 *        w2 = row count
 *        w4 = weight (low byte used, treated as an unsigned magnitude)
 *        v16 = offset term (8 x 16-bit)
 *        v18 = per-lane shift count for srshl (negative = rounding shift right)
 * Clobb: x0, w2, x5, v0, v2, v4, v6, v20, NZCV
 *
 * Two rows are packed per 64-bit vector.  An iteration handles four rows;
 * when fewer than four remain after the first pair has been loaded, the
 * tail at label 2 finishes that single pair.  Ends with `ret` on both
 * paths; the macro never falls through.
 */
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4                    // flags consumed by b.lt and b.ne below
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        b.lt            2f                              // only one row pair left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned 8 bit
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm

/*
 * weight_func w - emits ff_weight_h264_pixels_<w>_neon.
 *
 * Register use of the generated function:
 *   x0 = pixel rows, processed in place (x5 = x0 is the store pointer)
 *   w1 = stride (signed 32-bit, sign-extended here), w2 = row count
 *   w3 = shift parameter, w4 = weight, w5 = offset parameter
 *   (presumably block, stride, height, log2_denom, weight, offset of the
 *    C prototype -- confirm against the caller)
 *
 * Set-up:
 *   v16 = w5 << w3                 offset term
 *   w3 >  1: v18 = 1 - w3 and the halving ops shadd/shsub are used, so the
 *            halving plus the rounding shift by w3 - 1 give w3 bits total
 *   w3 <= 1: v18 = -w3 with plain add/sub
 *
 * The multiply is unsigned, so a negative weight is negated and the
 * subtracting variant (v16 - product) is used instead.
 *
 * Every weight_<w> expansion ends in `ret`, so the labelled variants do
 * not fall through into each other.  Each `b.lt 10f` targets the nearest
 * following `10:` label.
 */
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0
        b.le            20f                             // flags from cmp w3, #1 above
        sub             w6,  w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

// Instantiate ff_weight_h264_pixels_{16,8,4}_neon.
        weight_func     16
        weight_func     8
        weight_func     4