|
|
|
@ -1,6 +1,7 @@ |
|
|
|
|
/* |
|
|
|
|
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
|
|
|
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
|
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
|
|
|
|
|
* |
|
|
|
|
* This file is part of FFmpeg. |
|
|
|
|
* |
|
|
|
@ -181,6 +182,203 @@ function ff_h264_h_loop_filter_luma_neon, export=1 |
|
|
|
|
ret |
|
|
|
|
endfunc |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Shared prologue of the intra loop filters.
//
// Reads:    w1, w2 (alpha), w3 (beta).
// Writes:   x1  = w1 sign-extended to 64 bits,
//           w4  = scratch,
//           v30 = low byte of w2 replicated into all 16 byte lanes (alpha),
//           v31 = low byte of w3 replicated into all 16 byte lanes (beta).
// Control:  executes `ret` on behalf of the enclosing function when both
//           w2 and w3 are zero; otherwise falls through.
.macro  h264_loop_filter_start_intra
        orr             w4,  w2,  w3            // zero only if alpha == 0 and beta == 0
        cbnz            w4,  1f
        ret                                     // both thresholds are zero: return from the enclosing function
1:
        sxtw            x1,  w1                 // 64-bit copy of w1 for use as a register post-index
        dup             v30.16b, w2             // alpha
        dup             v31.16b, w3             // beta
.endm
|
|
|
|
|
|
|
|
|
// Intra luma edge filter, evaluated for 16 pixel positions in parallel
// (one position per byte lane).
//
// Inputs:   v4 = p3, v5 = p2, v6 = p1, v7 = p0,
//           v0 = q0, v1 = q1, v2 = q2, v3 = q3,
//           v30 = alpha, v31 = beta (replicated per byte lane).
// Outputs:  v5/v6/v7 (p2/p1/p0) and v0/v1/v2 (q0/q1/q2) are rewritten in
//           the lanes selected by the masks below; all other lanes keep
//           their input value. v3 and v4 are only read.
// Control:  branches forward to label 9, which the expanding function must
//           provide, when no lane satisfies if_1.
// Clobbers: x4, v16-v31 (including the alpha/beta copies in v30/v31).
//
// Per-lane conditions:
//   if_1: abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta
//   if_2: abs(p0 - q0) < (alpha >> 2) + 2
//   if_3: abs(p2 - p0) < beta
//   if_4: abs(q2 - q0) < beta
//
// Results:
//   p0'_1 = (2*p1 + p0 + q1 + 2) >> 2
//   q0'_1 = (2*q1 + q0 + p1 + 2) >> 2
//   p0'_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
//   p1'_2 = (p2 + p1 + p0 + q0 + 2) >> 2
//   p2'_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
//   q0'_2, q1'_2, q2'_2: the same expressions with p and q exchanged.
.macro  h264_loop_filter_luma_intra
        // ---- if_1: basic edge activity test --------------------------------
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // abs(p0 - q0) < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // abs(p1 - p0) < beta
        cmhi            v18.16b, v31.16b, v18.16b       // abs(q1 - q0) < beta

        // ---- if_2: tighter threshold on the p0/q0 step ---------------------
        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // v16 = if_2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // v19 = if_1
        shrn            v20.8b,  v19.8h,  #4            // squeeze the 16 mask bytes into 64 bits
        mov             x4,  v20.d[0]
        cbz             x4,  9f                         // no lane passes if_1: nothing to filter

        // ---- 16-bit sums: v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1
        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b
        uaddw           v22.8h,  v22.8h,  v6.8b
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2            // p0'_1 (low half)
        rshrn           v25.8b,  v22.8h,  #2            // q0'_1 (low half)
        rshrn2          v24.16b, v21.8h,  #2            // p0'_1 (high half)
        rshrn2          v25.16b, v23.8h,  #2            // q0'_1 (high half)

        // ---- if_3 / if_4 and the final selection masks ---------------------
        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // if_3
        cmhi            v18.16b, v31.16b, v18.16b       // if_4

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3): p side takes p0'_1
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4): q side takes q0'_1

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3: p side takes the _2 results
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4: q side takes the _2 results

        // ---- p side, _2 results (v19 is free from here on: if_1 is already
        //      folded into v17, v18, v30 and v31) ---------------------------
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3            // p0'_2
        rshrn2          v20.16b, v21.8h,  #3            // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2            // p1'_2
        rshrn2          v21.16b, v27.8h,  #2            // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1            // 2*(p3 + p2)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3            // p2'_2
        rshrn2          v19.16b, v29.8h,  #3            // p2'_2

        // ---- q side, _2 results (v7 still holds the unfiltered p0) ---------
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3            // q0'_2
        rshrn2          v22.16b, v23.8h,  #3            // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2            // q1'_2
        rshrn2          v23.16b, v27.8h,  #2            // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1            // 2*(q2 + q3)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3            // q2'_2
        rshrn2          v26.16b, v29.8h,  #3            // q2'_2

        // ---- write back: BIT copies bits only where the mask is set --------
        bit             v7.16b,  v24.16b, v30.16b       // p0'_1
        bit             v0.16b,  v25.16b, v31.16b       // q0'_1
        bit             v7.16b,  v20.16b, v17.16b       // p0'_2
        bit             v6.16b,  v21.16b, v17.16b       // p1'_2
        bit             v5.16b,  v19.16b, v17.16b       // p2'_2
        bit             v0.16b,  v22.16b, v18.16b       // q0'_2
        bit             v1.16b,  v23.16b, v18.16b       // q1'_2
        bit             v2.16b,  v26.16b, v18.16b       // q2'_2
.endm
|
|
|
|
|
|
|
|
|
// Intra luma loop filter working on rows: x0 addresses the q0 row, and every
// vector register holds 16 consecutive bytes of one row.
//
// In:  x0 = address of the q0 row, w1 = byte distance between rows,
//      w2 = alpha, w3 = beta.
//      NOTE(review): presumably the C prototype is (pix, stride, alpha, beta);
//      confirm against the declaration.
// Rows touched: loads p3..q3 (x0 - 4*x1 .. x0 + 3*x1),
//               stores p2..q2 (x0 - 3*x1 .. x0 + 2*x1).
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra                    // may return early; sets x1, v30, v31

        ld1             {v0.16b}, [x0], x1              // q0
        ld1             {v1.16b}, [x0], x1              // q1
        ld1             {v2.16b}, [x0], x1              // q2
        ld1             {v3.16b}, [x0], x1              // q3
        sub             x0,  x0,  x1, lsl #3            // from 4 rows below q0 back to the p3 row
        ld1             {v4.16b}, [x0], x1              // p3
        ld1             {v5.16b}, [x0], x1              // p2
        ld1             {v6.16b}, [x0], x1              // p1
        ld1             {v7.16b}, [x0]                  // p0 (x0 stays on the p0 row)

        h264_loop_filter_luma_intra                     // jumps to 9f when no lane is filtered

        sub             x0,  x0,  x1, lsl #1            // p0 row -> p2 row
        st1             {v5.16b}, [x0], x1              // p2
        st1             {v6.16b}, [x0], x1              // p1
        st1             {v7.16b}, [x0], x1              // p0
        st1             {v0.16b}, [x0], x1              // q0
        st1             {v1.16b}, [x0], x1              // q1
        st1             {v2.16b}, [x0]                  // q2
9:
        ret
endfunc
|
|
|
|
|
|
|
|
|
// Intra luma loop filter working on columns: 16 rows are processed, and for
// each row the 8 bytes starting at x0 - 4 are loaded, filtered and stored.
//
// In:  x0 = address of the first q0 byte, w1 = byte distance between rows,
//      w2 = alpha, w3 = beta.
//      NOTE(review): presumably the C prototype is (pix, stride, alpha, beta);
//      confirm against the declaration.
// Rows 0-7 fill the low 64 bits of v4-v7/v0-v3 and rows 8-15 the high 64
// bits. transpose_8x16B is defined outside this excerpt; it is expected to
// leave v4 = p3 ... v7 = p0, v0 = q0 ... v3 = q3 as required by
// h264_loop_filter_luma_intra, using v21 and v23 as temporaries (confirm
// against the macro definition).
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra                    // may return early; sets x1, v30, v31

        sub             x0,  x0,  #4                    // step back 4 bytes so each row starts at p3
        ld1             {v4.8b},  [x0], x1
        ld1             {v5.8b},  [x0], x1
        ld1             {v6.8b},  [x0], x1
        ld1             {v7.8b},  [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v1.8b},  [x0], x1
        ld1             {v2.8b},  [x0], x1
        ld1             {v3.8b},  [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra                     // jumps to 9f when no lane is filtered

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows consumed by the loads
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x0], x1
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:
        ret
endfunc
|
|
|
|
|
|
|
|
|
.macro h264_loop_filter_chroma
|
|
|
|
|
dup v22.8B, w2 // alpha |
|
|
|
|
dup v23.8B, w3 // beta |
|
|
|
@ -266,6 +464,105 @@ function ff_h264_h_loop_filter_chroma_neon, export=1 |
|
|
|
|
ret |
|
|
|
|
endfunc |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Intra chroma edge filter, evaluated for 8 pixel positions in parallel
// (one position per byte lane of the low 64 bits).
//
// Inputs:   v18 = p1, v16 = p0, v17 = q0, v19 = q1,
//           v30 = alpha, v31 = beta (replicated per byte lane).
// Outputs:  v16 = p0', v17 = q0' in the lanes where
//             abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta
//           holds; other lanes keep their input value.
//             p0' = (2*p1 + p0 + q1 + 2) >> 2
//             q0' = (2*q1 + q0 + p1 + 2) >> 2
// Control:  branches forward to label 9, which the expanding function must
//           provide, when none of the 8 lanes passes the test.
// Clobbers: x2, v4, v6, v20, v22, v24-v28. v4 and v6 are written even when
//           the branch to 9 is taken.
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b        // v26 = per-lane filter mask
        mov             x2,  v26.d[0]                   // all 8 mask bytes as one 64-bit value

        ushll           v4.8h,   v18.8b,  #1            // 2*p1
        ushll           v6.8h,   v19.8b,  #1            // 2*q1
        cbz             x2,  9f                         // mask is all zero: nothing to filter
        uaddl           v20.8h,  v16.8b,  v19.8b        // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b        // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h         // 2*p1 + p0 + q1
        add             v22.8h,  v22.8h,  v6.8h         // 2*q1 + q0 + p1
        uqrshrn         v24.8b,  v20.8h,  #2            // p0'
        uqrshrn         v25.8b,  v22.8h,  #2            // q0'
        bit             v16.8b,  v24.8b,  v26.8b        // replace p0 where the mask is set
        bit             v17.8b,  v25.8b,  v26.8b        // replace q0 where the mask is set
.endm
|
|
|
|
|
|
|
|
|
// Intra chroma loop filter working on rows: x0 addresses the q0 row, and
// 8 bytes of each of the rows p1, p0, q0, q1 are processed.
//
// In:  x0 = address of the q0 row, w1 = byte distance between rows,
//      w2 = alpha, w3 = beta.
//      NOTE(review): presumably the C prototype is (pix, stride, alpha, beta);
//      confirm against the declaration.
// Rows touched: loads p1..q1 (x0 - 2*x1 .. x0 + x1), stores p0 and q0.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra                    // may return early; sets x1, v30, v31

        sub             x0,  x0,  x1, lsl #1            // q0 row -> p1 row
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v17.8b}, [x0], x1              // q0
        ld1             {v19.8b}, [x0]                  // q1 (x0 stays on the q1 row)

        h264_loop_filter_chroma_intra                   // jumps to 9f when no lane is filtered

        sub             x0,  x0,  x1, lsl #1            // q1 row -> p0 row
        st1             {v16.8b}, [x0], x1              // p0
        st1             {v17.8b}, [x0], x1              // q0

9:
        ret
endfunc
|
|
|
|
|
|
|
|
|
// Intra chroma loop filter working on columns, 4 rows per call.
//
// In:  x0 = address of the first q0 byte, w1 = byte distance between rows,
//      w2 = alpha, w3 = beta.
//      NOTE(review): presumably the C prototype is (pix, stride, alpha, beta);
//      confirm against the declaration.
// x4 is the load cursor (starts at x0 - 2, i.e. the p1 byte) and x0 becomes
// the store cursor (x0 - 1, i.e. the p0 byte). Each load reads 8 bytes per
// row although only 4 rows are filtered and written back.
// transpose_4x8B is defined outside this excerpt; it is expected to leave
// v18 = p1, v16 = p0, v17 = q0, v19 = q1 with one row per byte lane, using
// v26-v29 as temporaries (confirm against the macro definition).
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra                    // may return early; sets x1, v30, v31

        sub             x4,  x0,  #2                    // load cursor: p1 column
        sub             x0,  x0,  #1                    // store cursor: p0 column
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra                   // jumps to 9f when no lane is filtered

        // One {p0, q0} byte pair per row, taken from lanes 0-3.
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc
|
|
|
|
|
|
|
|
|
// Intra chroma loop filter working on columns, 8 rows per call.
//
// In:  x0 = address of the first q0 byte, w1 = byte distance between rows,
//      w2 = alpha, w3 = beta.
//      NOTE(review): presumably the C prototype is (pix, stride, alpha, beta);
//      confirm against the declaration.
// x4 is the load cursor (starts at x0 - 2, i.e. the p1 byte) and x0 becomes
// the store cursor (x0 - 1, i.e. the p0 byte). Rows 0-3 are loaded as 8 bytes
// each; rows 4-7 are then loaded as 4 bytes each into 32-bit lane 1 of the
// same registers, replacing the upper half of the first loads.
// transpose_4x8B is defined outside this excerpt; it is expected to leave
// v18 = p1, v16 = p0, v17 = q0, v19 = q1 with one row per byte lane, using
// v26-v29 as temporaries (confirm against the macro definition).
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra                    // may return early; sets x1, v30, v31

        sub             x4,  x0,  #2                    // load cursor: p1 column
        sub             x0,  x0,  #1                    // store cursor: p0 column
        ld1             {v18.8b}, [x4], x1              // rows 0-3
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        ld1             {v18.s}[1], [x4], x1            // rows 4-7
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4]

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra                   // jumps to 9f when no lane is filtered

        // One {p0, q0} byte pair per row, taken from lanes 0-7.
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.macro biweight_16 macs, macd |
|
|
|
|
dup v0.16B, w5 |
|
|
|
|
dup v1.16B, w6 |
|
|
|
|