aarch64/vvc: Use faster clip operation

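Replace sqxtn+smin+smax with sqxtun+umin (and sqshrn with sqshrun in the avg path). The signed-to-unsigned saturating narrow already clamps negative values to zero, so only the upper bound of the pixel range still needs an explicit umin.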
pull/391/head
Zhao Zhili 2 months ago committed by Nuo Mi
parent bfed5f6b7d
commit 91436638de
  1. 41
      libavcodec/aarch64/vvc/inter.S

@ -36,13 +36,13 @@
.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
add v4.4s, v4.4s, v16.4s
sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
.else
mov v4.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
sqshl v4.4s, v4.4s, v22.4s
sqxtn v4.4h, v4.4s
sqxtun v4.4h, v4.4s
.endif
.if \bit_depth == 8
@ -54,8 +54,7 @@
.endif
.else // bit_depth > 8
smin v4.4h, v4.4h, v17.4h
smax v4.4h, v4.4h, v18.4h
umin v4.4h, v4.4h, v17.4h
.if \tap == 2
str s4, [dst]
.else
@ -95,7 +94,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
.if \bit_depth >= 10
// clip pixel
mov w6, #((1 << \bit_depth) - 1)
movi v18.8h, #0
dup v17.8h, w6
.endif
@ -121,8 +119,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
saddl2 v5.4s, v0.8h, v2.8h
add v4.4s, v4.4s, v16.4s
add v5.4s, v5.4s, v16.4s
sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
.else
mov v4.16b, v16.16b
mov v5.16b, v16.16b
@ -132,16 +130,15 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
smlal2 v5.4s, v2.8h, v20.8h
sqshl v4.4s, v4.4s, v22.4s
sqshl v5.4s, v5.4s, v22.4s
sqxtn v4.4h, v4.4s
sqxtn2 v4.8h, v5.4s
sqxtun v4.4h, v4.4s
sqxtun2 v4.8h, v5.4s
.endif
subs height, height, #1
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
st1 {v4.8b}, [dst], dst_stride
.else
smin v4.8h, v4.8h, v17.8h
smax v4.8h, v4.8h, v18.8h
umin v4.8h, v4.8h, v17.8h
st1 {v4.8h}, [dst], dst_stride
.endif
b.ne 8b
@ -163,10 +160,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
add v5.4s, v5.4s, v16.4s
add v6.4s, v6.4s, v16.4s
add v7.4s, v7.4s, v16.4s
sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
sqshrn v6.4h, v6.4s, #(15 - \bit_depth)
sqshrn2 v6.8h, v7.4s, #(15 - \bit_depth)
sqshrun v4.4h, v4.4s, #(15 - \bit_depth)
sqshrun2 v4.8h, v5.4s, #(15 - \bit_depth)
sqshrun v6.4h, v6.4s, #(15 - \bit_depth)
sqshrun2 v6.8h, v7.4s, #(15 - \bit_depth)
.else // avg
mov v4.16b, v16.16b
mov v5.16b, v16.16b
@ -184,10 +181,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
sqshl v5.4s, v5.4s, v22.4s
sqshl v6.4s, v6.4s, v22.4s
sqshl v7.4s, v7.4s, v22.4s
sqxtn v4.4h, v4.4s
sqxtn v6.4h, v6.4s
sqxtn2 v4.8h, v5.4s
sqxtn2 v6.8h, v7.4s
sqxtun v4.4h, v4.4s
sqxtun v6.4h, v6.4s
sqxtun2 v4.8h, v5.4s
sqxtun2 v6.8h, v7.4s
.endif // w_avg
subs w6, w6, #16
.if \bit_depth == 8
@ -195,10 +192,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
sqxtun2 v4.16b, v6.8h
str q4, [x9], #16
.else
smin v4.8h, v4.8h, v17.8h
smin v6.8h, v6.8h, v17.8h
smax v4.8h, v4.8h, v18.8h
smax v6.8h, v6.8h, v18.8h
umin v4.8h, v4.8h, v17.8h
umin v6.8h, v6.8h, v17.8h
stp q4, q6, [x9], #32
.endif
b.ne 17b

Loading…
Cancel
Save