h264/aarch64: optimize neon loop filter

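Exit as soon as possible if no filtering will be done: the combined
threshold-comparison mask is moved to a general-purpose register and tested
with `cbz`, which branches to a new `9:` label placed directly in front of
`ret` when the mask is all zero.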

Improves the checkasm --bench cycle count on a Snapdragon 820e
(before -> after, lower is better):
h264_h_loop_filter_luma_8bpp_c:      72.4 ->  72.5
h264_h_loop_filter_luma_8bpp_neon:   97.1 ->  56.3
h264_v_loop_filter_luma_8bpp_c:     174.0 -> 173.5
h264_v_loop_filter_luma_8bpp_neon:   62.9 ->  60.9
h264_h_loop_filter_chroma_8bpp_c:    30.2 ->  30.3
h264_h_loop_filter_chroma_8bpp_neon: 51.6 ->  25.7
h264_v_loop_filter_chroma_8bpp_c:    57.3 ->  57.3
h264_v_loop_filter_chroma_8bpp_neon: 28.0 ->  24.0
pull/325/head
Janne Grunau 6 years ago
parent d7f4f5c4a1
commit 846c3d6aca
1 file changed, 33 changed lines: libavcodec/aarch64/h264dsp_neon.S

@ -54,9 +54,12 @@
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0) uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B and v21.16B, v21.16B, v28.16B
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0) uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
and v21.16B, v21.16B, v30.16B // < beta
shrn v30.8b, v21.8h, #4
mov x7, v30.d[0]
cmhi v17.16B, v22.16B, v17.16B // < beta cmhi v17.16B, v22.16B, v17.16B // < beta
and v21.16B, v21.16B, v30.16B
cmhi v19.16B, v22.16B, v19.16B // < beta cmhi v19.16B, v22.16B, v19.16B // < beta
cbz x7, 9f
and v17.16B, v17.16B, v21.16B and v17.16B, v17.16B, v21.16B
and v19.16B, v19.16B, v21.16B and v19.16B, v19.16B, v21.16B
and v24.16B, v24.16B, v21.16B and v24.16B, v24.16B, v21.16B
@ -124,7 +127,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
st1 {v16.16B}, [x0], x1 st1 {v16.16B}, [x0], x1
st1 {v0.16B}, [x0], x1 st1 {v0.16B}, [x0], x1
st1 {v19.16B}, [x0] st1 {v19.16B}, [x0]
9:
ret ret
endfunc endfunc
@ -174,32 +177,34 @@ function ff_h264_h_loop_filter_luma_neon, export=1
st1 {v16.S}[3], [x0], x1 st1 {v16.S}[3], [x0], x1
st1 {v0.S}[3], [x0], x1 st1 {v0.S}[3], [x0], x1
st1 {v19.S}[3], [x0], x1 st1 {v19.S}[3], [x0], x1
9:
ret ret
endfunc endfunc
.macro h264_loop_filter_chroma .macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha dup v22.8B, w2 // alpha
dup v23.8B, w3 // beta
uxtl v24.8H, v24.8B uxtl v24.8H, v24.8B
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0) uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
uxtl v4.8H, v0.8B
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0) uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
cmhi v26.8B, v22.8B, v26.8B // < alpha
cmhi v28.8B, v23.8B, v28.8B // < beta
cmhi v30.8B, v23.8B, v30.8B // < beta
uxtl v4.8H, v0.8B
and v26.8B, v26.8B, v28.8B
usubw v4.8H, v4.8H, v16.8B usubw v4.8H, v4.8H, v16.8B
sli v24.8H, v24.8H, #8 and v26.8B, v26.8B, v30.8B
shl v4.8H, v4.8H, #2 shl v4.8H, v4.8H, #2
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0) mov x2, v26.d[0]
sli v24.8H, v24.8H, #8
uaddw v4.8H, v4.8H, v18.8B uaddw v4.8H, v4.8H, v18.8B
cmhi v26.8B, v22.8B, v26.8B // < alpha cbz x2, 9f
usubw v4.8H, v4.8H, v2.8B usubw v4.8H, v4.8H, v2.8B
dup v22.8B, w3 // beta
rshrn v4.8B, v4.8H, #3 rshrn v4.8B, v4.8H, #3
cmhi v28.8B, v22.8B, v28.8B // < beta
cmhi v30.8B, v22.8B, v30.8B // < beta
smin v4.8B, v4.8B, v24.8B smin v4.8B, v4.8B, v24.8B
neg v25.8B, v24.8B neg v25.8B, v24.8B
and v26.8B, v26.8B, v28.8B
smax v4.8B, v4.8B, v25.8B smax v4.8B, v4.8B, v25.8B
and v26.8B, v26.8B, v30.8B
uxtl v22.8H, v0.8B uxtl v22.8H, v0.8B
and v4.8B, v4.8B, v26.8B and v4.8B, v4.8B, v26.8B
uxtl v28.8H, v16.8B uxtl v28.8H, v16.8B
@ -224,7 +229,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1
sub x0, x0, x1, lsl #1 sub x0, x0, x1, lsl #1
st1 {v16.8B}, [x0], x1 st1 {v16.8B}, [x0], x1
st1 {v0.8B}, [x0], x1 st1 {v0.8B}, [x0], x1
9:
ret ret
endfunc endfunc
@ -257,7 +262,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
st1 {v16.S}[1], [x0], x1 st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1 st1 {v0.S}[1], [x0], x1
st1 {v2.S}[1], [x0], x1 st1 {v2.S}[1], [x0], x1
9:
ret ret
endfunc endfunc

Loading…
Cancel
Save