From 91436638defe83f4241ead35dd9192b1c0ebfe18 Mon Sep 17 00:00:00 2001
From: Zhao Zhili
Date: Tue, 10 Dec 2024 12:19:02 +0800
Subject: [PATCH] aarch64/vvc: Use faster clip operation

Replace sqxtn+smin+smax with sqxtun+umin (and sqshrn with sqshrun).
The values being clipped are pixels in [0, (1 << bit_depth) - 1], so
the unsigned saturating narrow already clamps the lower bound to zero
and only an upper-bound umin is needed.
---
 libavcodec/aarch64/vvc/inter.S | 41 +++++++++++++++-------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index b6b079b569..7a752019ee 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -36,13 +36,13 @@
 .ifc \type, avg
         saddl           v4.4s, v0.4h, v2.4h
         add             v4.4s, v4.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
 .else
         mov             v4.16b, v16.16b
         smlal           v4.4s, v0.4h, v19.4h
         smlal           v4.4s, v2.4h, v20.4h
         sqshl           v4.4s, v4.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
+        sqxtun          v4.4h, v4.4s
 .endif
 
 .if \bit_depth == 8
@@ -54,8 +54,7 @@
 .endif
 
 .else // bit_depth > 8
-        smin            v4.4h, v4.4h, v17.4h
-        smax            v4.4h, v4.4h, v18.4h
+        umin            v4.4h, v4.4h, v17.4h
 .if \tap == 2
         str             s4, [dst]
 .else
@@ -95,7 +94,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 .if \bit_depth >= 10
         // clip pixel
         mov             w6, #((1 << \bit_depth) - 1)
-        movi            v18.8h, #0
         dup             v17.8h, w6
 .endif
 
@@ -121,8 +119,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         saddl2          v5.4s, v0.8h, v2.8h
         add             v4.4s, v4.4s, v16.4s
         add             v5.4s, v5.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
 .else
         mov             v4.16b, v16.16b
         mov             v5.16b, v16.16b
@@ -132,16 +130,15 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         smlal2          v5.4s, v2.8h, v20.8h
         sqshl           v4.4s, v4.4s, v22.4s
         sqshl           v5.4s, v5.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
-        sqxtn2          v4.8h, v5.4s
+        sqxtun          v4.4h, v4.4s
+        sqxtun2         v4.8h, v5.4s
 .endif
         subs            height, height, #1
 .if \bit_depth == 8
         sqxtun          v4.8b, v4.8h
         st1             {v4.8b}, [dst], dst_stride
 .else
-        smin            v4.8h, v4.8h, v17.8h
-        smax            v4.8h, v4.8h, v18.8h
+        umin            v4.8h, v4.8h, v17.8h
         st1             {v4.8h}, [dst], dst_stride
 .endif
         b.ne            8b
@@ -163,10 +160,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         add             v5.4s, v5.4s, v16.4s
         add             v6.4s, v6.4s, v16.4s
         add             v7.4s, v7.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
-        sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
-        sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
+        sqshrun         v6.4h, v6.4s, #(15 - \bit_depth)
+        sqshrun2        v6.8h, v7.4s, #(15 - \bit_depth)
 .else // avg
         mov             v4.16b, v16.16b
         mov             v5.16b, v16.16b
@@ -184,10 +181,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         sqshl           v5.4s, v5.4s, v22.4s
         sqshl           v6.4s, v6.4s, v22.4s
         sqshl           v7.4s, v7.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
-        sqxtn           v6.4h, v6.4s
-        sqxtn2          v4.8h, v5.4s
-        sqxtn2          v6.8h, v7.4s
+        sqxtun          v4.4h, v4.4s
+        sqxtun          v6.4h, v6.4s
+        sqxtun2         v4.8h, v5.4s
+        sqxtun2         v6.8h, v7.4s
 .endif // w_avg
         subs            w6, w6, #16
 .if \bit_depth == 8
@@ -195,10 +192,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
         sqxtun2         v4.16b, v6.8h
         str             q4, [x9], #16
 .else
-        smin            v4.8h, v4.8h, v17.8h
-        smin            v6.8h, v6.8h, v17.8h
-        smax            v4.8h, v4.8h, v18.8h
-        smax            v6.8h, v6.8h, v18.8h
+        umin            v4.8h, v4.8h, v17.8h
+        umin            v6.8h, v6.8h, v17.8h
         stp             q4, q6, [x9], #32
 .endif
         b.ne            17b
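
Note (not part of the commit): below is a scalar C model of the two clip
sequences, sketched to illustrate why they are interchangeable for this code.
The helper names and the test loop are invented for the illustration; only the
saturation/min/max semantics mirror the NEON instructions touched by the patch.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Old sequence: sqxtn (narrow with signed saturation), then smin/smax
     * against the pixel range. */
    static int16_t clip_old(int32_t x, int bit_depth)
    {
        int32_t v = x;
        if (v > INT16_MAX) v = INT16_MAX;   /* sqxtn: saturate to signed 16-bit */
        if (v < INT16_MIN) v = INT16_MIN;
        const int32_t max = (1 << bit_depth) - 1;
        if (v > max) v = max;               /* smin v, max_pixel */
        if (v < 0)   v = 0;                 /* smax v, 0 */
        return (int16_t)v;
    }

    /* New sequence: sqxtun (narrow with unsigned saturation), then a single
     * umin against the pixel range; negatives are already clamped to 0. */
    static uint16_t clip_new(int32_t x, int bit_depth)
    {
        uint32_t v;
        if (x < 0)
            v = 0;                          /* sqxtun clamps negatives to 0 */
        else if (x > 0xffff)
            v = 0xffff;                     /* ...and saturates to UINT16_MAX */
        else
            v = (uint32_t)x;
        const uint32_t max = (1u << bit_depth) - 1;
        return (uint16_t)(v < max ? v : max); /* umin v, max_pixel */
    }

    int main(void)
    {
        /* Spot-check that both sequences agree for 8/10/12-bit pixel ranges. */
        for (int bd = 8; bd <= 12; bd += 2)
            for (int64_t x = -80000; x <= 80000; x++)
                assert((uint16_t)clip_old((int32_t)x, bd) == clip_new((int32_t)x, bd));
        puts("old and new clip sequences agree");
        return 0;
    }

Because the pixel maximum (at most 4095 for 12-bit) fits inside both the signed
and unsigned 16-bit saturation ranges, the two sequences return the same value
for every 32-bit intermediate, while the new one saves one vector operation per
clip and drops the v18 zero constant.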