From 21c89f3a26bb1331381b90e653277585447cfbb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 9 Jan 2017 00:04:19 +0200
Subject: [PATCH] arm/aarch64: vp9: Fix vertical alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Align the second/third operands as they usually are.

Due to the wildly varying sizes of the written out operands
in aarch64 assembly, the column alignment is usually not as clear
as in arm assembly.

This is cherrypicked from libav commit
7995ebfad12002033c73feed422a1cfc62081e8f.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/vp9itxfm_neon.S | 36 +++++++++++++++---------------
 libavcodec/arm/vp9itxfm_neon.S     | 14 ++++++------
 libavcodec/arm/vp9lpf_neon.S       |  2 +-
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 3e5da0880c..b12890f0db 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -380,7 +380,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
         movrel          x4,  idct_coeffs
 .else
-        movrel          x4, iadst8_coeffs
+        movrel          x4,  iadst8_coeffs
         ld1             {v1.8h}, [x4], #16
 .endif
         ld1             {v0.8h}, [x4]
@@ -480,23 +480,23 @@ itxfm_func8x8 iadst, iadst
 
 
 function idct16x16_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
         ld1             {v0.4h}, [x4]
 
-        movi            v1.4h, #0
+        movi            v1.4h,  #0
 
         ld1             {v2.h}[0], [x2]
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
         dup             v2.8h,  v2.h[0]
         st1             {v1.h}[0], [x2]
 
-        srshr           v2.8h, v2.8h, #6
+        srshr           v2.8h,  v2.8h,  #6
 
-        mov             x3, x0
-        mov             x4, #16
+        mov             x3,  x0
+        mov             x4,  #16
 1:
         // Loop to add the constant from v2 into all 16x16 outputs
         subs            x4,  x4,  #2
@@ -869,7 +869,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
         ld1             {v0.8h,v1.8h}, [x10]
 .endif
-        mov             x9, #32
+        mov             x9,  #32
 
 .ifc \txfm1\()_\txfm2,idct_idct
         cmp             w3,  #10
@@ -1046,10 +1046,10 @@ idct16_partial quarter
 idct16_partial half
 
 function idct32x32_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
         ld1             {v0.4h}, [x4]
 
-        movi            v1.4h, #0
+        movi            v1.4h,  #0
 
         ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h,  v0.h[0]
@@ -1059,10 +1059,10 @@ function idct32x32_dc_add_neon
         dup             v2.8h,  v2.h[0]
         st1             {v1.h}[0], [x2]
 
-        srshr           v0.8h, v2.8h, #6
+        srshr           v0.8h,  v2.8h,  #6
 
-        mov             x3, x0
-        mov             x4, #32
+        mov             x3,  x0
+        mov             x4,  #32
 1:
         // Loop to add the constant v0 into all 32x32 outputs
         subs            x4,  x4,  #2
@@ -1230,7 +1230,7 @@ endfunc
 // x9 = double input stride
 function idct32_1d_8x32_pass1\suffix\()_neon
         mov             x14, x30
-        movi            v2.8h, #0
+        movi            v2.8h,  #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .ifb \suffix
@@ -1295,7 +1295,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
 .endif
         add             x2,  x2,  #64
 
-        movi            v2.8h, #0
+        movi            v2.8h,  #0
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 6d4d765c28..6c09922cae 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -530,7 +530,7 @@ function idct16x16_dc_add_neon
         movrel          r12, idct_coeffs
         vld1.16         {d0}, [r12,:64]
 
-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0
 
         vld1.16         {d16[]}, [r2,:16]
         vmull.s16       q8,  d16, d0[0]
@@ -793,7 +793,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         push            {lr}
 
         mov             r12, #32
-        vmov.s16        q2, #0
+        vmov.s16        q2,  #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
@@ -1142,7 +1142,7 @@ function idct32x32_dc_add_neon
         movrel          r12, idct_coeffs
         vld1.16         {d0}, [r12,:64]
 
-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0
 
         vld1.16         {d16[]}, [r2,:16]
         vmull.s16       q8,  d16, d0[0]
@@ -1330,7 +1330,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
 
         @ Double stride of the input, since we only read every other line
         mov             r12, #128
-        vmov.s16        d4, #0
+        vmov.s16        d4,  #0
 
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .ifb \suffix
@@ -1394,7 +1394,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
 .endif
         add             r2,  r2,  #64
 
-        vmov.s16        d8, #0
+        vmov.s16        d8,  #0
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1533,9 +1533,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon
 .endif
         vld1.32         {d12[]},  [r0,:32], r1
         vld1.32         {d12[1]}, [r0,:32], r1
-        vrshr.s16       q4, q4, #6
+        vrshr.s16       q4,  q4,  #6
         vld1.32         {d13[]},  [r0,:32], r1
-        vrshr.s16       q5, q5, #6
+        vrshr.s16       q5,  q5,  #6
         vld1.32         {d13[1]}, [r0,:32], r1
         sub             r0,  r0,  r1, lsl #2
         vaddw.u8        q4,  q4,  d12
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index 8d44d58f32..4b3608064a 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -828,7 +828,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
 endfunc
 
 function vp9_loop_filter_h_16_neon
-        sub             r12,  r0,  #8
+        sub             r12, r0,  #8
         vld1.8          {d16}, [r12,:64], r1
         vld1.8          {d24}, [r0, :64], r1
         vld1.8          {d17}, [r12,:64], r1