From 3a9f44d5d5dcc9a805aa8345d922c1c5a53b681a Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Mon, 20 Aug 2007 22:29:21 +0000
Subject: [PATCH] and of course the unneeded double subtractions were blindly
 put in the mmx code; this also makes the affected code 4% faster

Originally committed as revision 10156 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/i386/snowdsp_mmx.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/libavcodec/i386/snowdsp_mmx.c b/libavcodec/i386/snowdsp_mmx.c
index 1d5a4f4ff7..4d40e46f8f 100644
--- a/libavcodec/i386/snowdsp_mmx.c
+++ b/libavcodec/i386/snowdsp_mmx.c
@@ -294,9 +294,10 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
         DWTELEM * const ref = b+w2 - 1;
 
         i = 1;
-        b[0] = b[0] - (((-2 * ref[1] + W_BO) - 4 * b[0]) >> W_BS);
+        b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS);
         asm volatile(
-            "pslld          $1, %%mm7       \n\t" /* xmm7 already holds a '4' from 2 lifts ago. */
+            "pcmpeqd     %%mm7, %%mm7        \n\t"
+            "psrld         $29, %%mm7        \n\t"
            ::);
         for(; i<w_l-3; i+=4){
             asm volatile(
@@ -304,22 +305,18 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
                 "movq    8(%1), %%mm4        \n\t"
                 "paddd   4(%1), %%mm0        \n\t"
                 "paddd  12(%1), %%mm4        \n\t"
-                "movq    %%mm7, %%mm1        \n\t"
-                "movq    %%mm7, %%mm5        \n\t"
-                "psubd   %%mm0, %%mm1        \n\t"
-                "psubd   %%mm4, %%mm5        \n\t"
-                "movq     (%0), %%mm0        \n\t"
-                "movq    8(%0), %%mm4        \n\t"
-                "pslld      $2, %%mm0        \n\t"
-                "pslld      $2, %%mm4        \n\t"
-                "psubd   %%mm0, %%mm1        \n\t"
-                "psubd   %%mm4, %%mm5        \n\t"
-                "psrad      $4, %%mm1        \n\t"
-                "psrad      $4, %%mm5        \n\t"
-                "movq     (%0), %%mm0        \n\t"
-                "movq    8(%0), %%mm4        \n\t"
-                "psubd   %%mm1, %%mm0        \n\t"
-                "psubd   %%mm5, %%mm4        \n\t"
+                "paddd   %%mm7, %%mm0        \n\t"
+                "paddd   %%mm7, %%mm4        \n\t"
+                "psrad      $2, %%mm0        \n\t"
+                "psrad      $2, %%mm4        \n\t"
+                "movq     (%0), %%mm1        \n\t"
+                "movq    8(%0), %%mm5        \n\t"
+                "paddd   %%mm1, %%mm0        \n\t"
+                "paddd   %%mm5, %%mm4        \n\t"
+                "psrad      $2, %%mm0        \n\t"
+                "psrad      $2, %%mm4        \n\t"
+                "paddd   %%mm1, %%mm0        \n\t"
+                "paddd   %%mm5, %%mm4        \n\t"
                 "movq    %%mm0, (%0)         \n\t"
                 "movq    %%mm4, 8(%0)        \n\t"
                 :: "r"(&b[i]), "r"(&ref[i])