From 91436638defe83f4241ead35dd9192b1c0ebfe18 Mon Sep 17 00:00:00 2001
From: Zhao Zhili
Date: Tue, 10 Dec 2024 12:19:02 +0800
Subject: [PATCH] aarch64/vvc: Use faster clip operation

Replace sqxtn+smin+smax with sqxtun+umin (and sqshrn with sqshrun).
The values being clipped are pixels in [0, (1 << bit_depth) - 1], so
the unsigned saturating narrow already clamps the lower bound to zero
and only an upper-bound umin is needed.
---
 libavcodec/aarch64/vvc/inter.S | 41 +++++++++++++++-------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index b6b079b569..7a752019ee 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -36,13 +36,13 @@
 .ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         add             v4.4s, v4.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
 .else
         mov             v4.16b, v16.16b
         smlal           v4.4s, v0.4h, v19.4h
         smlal           v4.4s, v2.4h, v20.4h
         sqshl           v4.4s, v4.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
+        sqxtun          v4.4h, v4.4s
 .endif
 
 .if \bit_depth == 8
@@ -54,8 +54,7 @@
 .endif
 
 .else // bit_depth > 8
-        smin            v4.4h, v4.4h, v17.4h
-        smax            v4.4h, v4.4h, v18.4h
+        umin            v4.4h, v4.4h, v17.4h
 .if \tap == 2
         str             s4, [dst]
 .else
@@ -95,7 +94,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 .if \bit_depth >= 10
         // clip pixel
         mov             w6, #((1 << \bit_depth) - 1)
-        movi            v18.8h, #0
         dup             v17.8h, w6
 .endif
 
@@ -121,8 +119,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         saddl2          v5.4s, v0.8h, v2.8h
         add             v4.4s, v4.4s, v16.4s
         add             v5.4s, v5.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
 .else
         mov             v4.16b, v16.16b
         mov             v5.16b, v16.16b
@@ -132,16 +130,15 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         smlal2          v5.4s, v2.8h, v20.8h
         sqshl           v4.4s, v4.4s, v22.4s
         sqshl           v5.4s, v5.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
-        sqxtn2          v4.8h, v5.4s
+        sqxtun          v4.4h, v4.4s
+        sqxtun2         v4.8h, v5.4s
 .endif
         subs            height, height, #1
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
         st1             {v4.8b}, [dst], dst_stride
 .else
-        smin            v4.8h, v4.8h, v17.8h
-        smax            v4.8h, v4.8h, v18.8h
+        umin            v4.8h, v4.8h, v17.8h
         st1             {v4.8h}, [dst], dst_stride
 .endif
         b.ne            8b
@@ -163,10 +160,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         add             v5.4s, v5.4s, v16.4s
         add             v6.4s, v6.4s, v16.4s
         add             v7.4s, v7.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
-        sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
-        sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
+        sqshrun         v6.4h, v6.4s, #(15 - \bit_depth)
+        sqshrun2        v6.8h, v7.4s, #(15 - \bit_depth)
 .else // avg
         mov             v4.16b, v16.16b
         mov             v5.16b, v16.16b
@@ -184,10 +181,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         sqshl           v5.4s, v5.4s, v22.4s
         sqshl           v6.4s, v6.4s, v22.4s
         sqshl           v7.4s, v7.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
-        sqxtn           v6.4h, v6.4s
-        sqxtn2          v4.8h, v5.4s
-        sqxtn2          v6.8h, v7.4s
+        sqxtun          v4.4h, v4.4s
+        sqxtun          v6.4h, v6.4s
+        sqxtun2         v4.8h, v5.4s
+        sqxtun2         v6.8h, v7.4s
 .endif // w_avg
         subs            w6, w6, #16
 .if \bit_depth == 8
@@ -195,10 +192,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         sqxtun2         v4.16b, v6.8h
         str             q4, [x9], #16
 .else
-        smin            v4.8h, v4.8h, v17.8h
-        smin            v6.8h, v6.8h, v17.8h
-        smax            v4.8h, v4.8h, v18.8h
-        smax            v6.8h, v6.8h, v18.8h
+        umin            v4.8h, v4.8h, v17.8h
+        umin            v6.8h, v6.8h, v17.8h
         stp             q4, q6, [x9], #32
 .endif
         b.ne            17b
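
Note (not part of the commit): below is a scalar C model of the two clip
sequences, sketched to illustrate why they are interchangeable for this code.
The helper names and the test loop are invented for the illustration; only the
saturation/min/max semantics mirror the NEON instructions touched by the patch.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Old sequence: sqxtn (narrow with signed saturation), then smin/smax
     * against the pixel range. */
    static int16_t clip_old(int32_t x, int bit_depth)
    {
        int32_t v = x;
        if (v > INT16_MAX) v = INT16_MAX;   /* sqxtn: saturate to signed 16-bit */
        if (v < INT16_MIN) v = INT16_MIN;
        const int32_t max = (1 << bit_depth) - 1;
        if (v > max) v = max;               /* smin v, max_pixel */
        if (v < 0)   v = 0;                 /* smax v, 0 */
        return (int16_t)v;
    }

    /* New sequence: sqxtun (narrow with unsigned saturation), then a single
     * umin against the pixel range; negatives are already clamped to 0. */
    static uint16_t clip_new(int32_t x, int bit_depth)
    {
        uint32_t v;
        if (x < 0)
            v = 0;                          /* sqxtun clamps negatives to 0 */
        else if (x > 0xffff)
            v = 0xffff;                     /* ...and saturates to UINT16_MAX */
        else
            v = (uint32_t)x;
        const uint32_t max = (1u << bit_depth) - 1;
        return (uint16_t)(v < max ? v : max); /* umin v, max_pixel */
    }

    int main(void)
    {
        /* Spot-check that both sequences agree for 8/10/12-bit pixel ranges. */
        for (int bd = 8; bd <= 12; bd += 2)
            for (int64_t x = -80000; x <= 80000; x++)
                assert((uint16_t)clip_old((int32_t)x, bd) == clip_new((int32_t)x, bd));
        puts("old and new clip sequences agree");
        return 0;
    }

Because the pixel maximum (at most 4095 for 12-bit) fits inside both the signed
and unsigned 16-bit saturation ranges, the two sequences return the same value
for every 32-bit intermediate, while the new one saves one vector operation per
clip and drops the v18 zero constant.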