aarch64/vvc: Use faster clip operation

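Replace sqxtn/sqshrn+smin+smax with sqxtun/sqshrun+umin: the signed-to-unsigned saturating narrow already clamps negative values to 0, so only the upper bound still needs an explicit clip.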
2 months ago · 91436638de
parent bfed5f6b7d
commit 91436638de
1 changed file with 18 additions and 23 deletions
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -36,13 +36,13 @@
 .ifc \type, avg
        saddl           v4.4s, v0.4h, v2.4h
        add             v4.4s, v4.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
 .else
        mov             v4.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        sqshl           v4.4s, v4.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
+        sqxtun          v4.4h, v4.4s
 .endif

 .if \bit_depth == 8
@@ -54,8 +54,7 @@
 .endif

 .else   // bit_depth > 8
-        smin            v4.4h, v4.4h, v17.4h
-        smax            v4.4h, v4.4h, v18.4h
+        umin            v4.4h, v4.4h, v17.4h
 .if \tap == 2
        str             s4, [dst]
 .else
@@ -95,7 +94,6 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
 .if \bit_depth >= 10
        // clip pixel
        mov             w6, #((1 << \bit_depth) - 1)
-        movi            v18.8h, #0
        dup             v17.8h, w6
 .endif

@@ -121,8 +119,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        saddl2          v5.4s, v0.8h, v2.8h
        add             v4.4s, v4.4s, v16.4s
        add             v5.4s, v5.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
 .else
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
@@ -132,16 +130,15 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        smlal2          v5.4s, v2.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
-        sqxtn2          v4.8h, v5.4s
+        sqxtun          v4.4h, v4.4s
+        sqxtun2         v4.8h, v5.4s
 .endif
        subs            height, height, #1
 .if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        st1             {v4.8b}, [dst], dst_stride
 .else
-        smin            v4.8h, v4.8h, v17.8h
-        smax            v4.8h, v4.8h, v18.8h
+        umin            v4.8h, v4.8h, v17.8h
        st1             {v4.8h}, [dst], dst_stride
 .endif
        b.ne            8b
@@ -163,10 +160,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        add             v5.4s, v5.4s, v16.4s
        add             v6.4s, v6.4s, v16.4s
        add             v7.4s, v7.4s, v16.4s
-        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
-        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
-        sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
-        sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
+        sqshrun         v4.4h, v4.4s, #(15 - \bit_depth)
+        sqshrun2        v4.8h, v5.4s, #(15 - \bit_depth)
+        sqshrun         v6.4h, v6.4s, #(15 - \bit_depth)
+        sqshrun2        v6.8h, v7.4s, #(15 - \bit_depth)
 .else   // avg
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
@@ -184,10 +181,10 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        sqshl           v5.4s, v5.4s, v22.4s
        sqshl           v6.4s, v6.4s, v22.4s
        sqshl           v7.4s, v7.4s, v22.4s
-        sqxtn           v4.4h, v4.4s
-        sqxtn           v6.4h, v6.4s
-        sqxtn2          v4.8h, v5.4s
-        sqxtn2          v6.8h, v7.4s
+        sqxtun          v4.4h, v4.4s
+        sqxtun          v6.4h, v6.4s
+        sqxtun2         v4.8h, v5.4s
+        sqxtun2         v6.8h, v7.4s
 .endif  // w_avg
        subs            w6, w6, #16
 .if \bit_depth == 8
@@ -195,10 +192,8 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        sqxtun2         v4.16b, v6.8h
        str             q4, [x9], #16
 .else
-        smin            v4.8h, v4.8h, v17.8h
-        smin            v6.8h, v6.8h, v17.8h
-        smax            v4.8h, v4.8h, v18.8h
-        smax            v6.8h, v6.8h, v18.8h
+        umin            v4.8h, v4.8h, v17.8h
+        umin            v6.8h, v6.8h, v17.8h
        stp             q4, q6, [x9], #32
 .endif
        b.ne            17b