From ba6802debf8167f8b9259e83f820dfb53c15a227 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Wed, 27 Mar 2002 21:25:22 +0000
Subject: [PATCH] 4MV motion estimation (not finished yet); SAD functions
 rewritten (8x8 support & MMX2 optimizations); HQ inter/intra decision;
 msmpeg4 encoding bugfix (MVs were too long)

Originally committed as revision 362 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/avcodec.h             |   5 +
 libavcodec/dsputil.c             | 207 +++++++++----
 libavcodec/dsputil.h             |  16 +-
 libavcodec/h263.c                |  14 +-
 libavcodec/i386/dsputil_mmx.c    |  53 +++-
 libavcodec/i386/motion_est_mmx.c | 514 +++++++++++++++++++------------
 libavcodec/motion_est.c          | 340 +++++++++++++++++---
 libavcodec/mpegvideo.c           | 410 ++++++++++++++----------
 libavcodec/mpegvideo.h           |  24 +-
 9 files changed, 1081 insertions(+), 502 deletions(-)

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 0bca2b4fda..df6a7cfab7 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -61,9 +61,14 @@ extern int motion_estimation_method;
 #define ME_X1     5
 
 /* encoding support */
+/* note: not everything is supported yet */
 
 #define CODEC_FLAG_HQ     0x0001 /* high quality (non real time) encoding */
 #define CODEC_FLAG_QSCALE 0x0002 /* use fixed qscale */
+#define CODEC_FLAG_4MV    0x0004 /* 4 MV per MB allowed */
+#define CODEC_FLAG_B      0x0008 /* use B frames */
+#define CODEC_FLAG_QPEL   0x0010 /* use qpel MC */
+#define CODEC_FLAG_GMC    0x0020 /* use GMC */
 
 /* codec capabilities */
 
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 0e698f35ce..d27687d84a 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -36,6 +36,11 @@ op_pixels_abs_func pix_abs16x16_x2;
 op_pixels_abs_func pix_abs16x16_y2;
 op_pixels_abs_func pix_abs16x16_xy2;
 
+op_pixels_abs_func pix_abs8x8;
+op_pixels_abs_func pix_abs8x8_x2;
+op_pixels_abs_func pix_abs8x8_y2;
+op_pixels_abs_func pix_abs8x8_xy2;
+
 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
 UINT32 squareTbl[512];
 
@@ -377,14 +382,14 @@ static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride,
     int i;
     for(i=0; i<h; i++)
     {
-        dst[0]= cm[(((src[0]+src[1])*160 - (src[0]+src[2])*48 + (src[1]+src[3])*24 - (src[2]+src[4])*8 + r)>>8)];
-        dst[1]= cm[(((src[1]+src[2])*160 - (src[0]+src[3])*48 + (src[0]+src[4])*24 - (src[1]+src[5])*8 + r)>>8)];
-        dst[2]= cm[(((src[2]+src[3])*160 - (src[1]+src[4])*48 + (src[0]+src[5])*24 - (src[0]+src[6])*8 + r)>>8)];
-        dst[3]= cm[(((src[3]+src[4])*160 - (src[2]+src[5])*48 + (src[1]+src[6])*24 - (src[0]+src[7])*8 + r)>>8)];
-        dst[4]= cm[(((src[4]+src[5])*160 - (src[3]+src[6])*48 + (src[2]+src[7])*24 - (src[1]+src[8])*8 + r)>>8)];
-        dst[5]= cm[(((src[5]+src[6])*160 - (src[4]+src[7])*48 + (src[3]+src[8])*24 - (src[2]+src[8])*8 + r)>>8)];
-        dst[6]= cm[(((src[6]+src[7])*160 - (src[5]+src[8])*48 + (src[4]+src[8])*24 - (src[3]+src[7])*8 + r)>>8)];
-        dst[7]= cm[(((src[7]+src[8])*160 - (src[6]+src[8])*48 + (src[5]+src[7])*24 - (src[4]+src[6])*8 + r)>>8)];
+        dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
+        dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
+        dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
+        dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
+        dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
+        dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
+        dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
+        dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
         dst+=dstStride;
         src+=srcStride;
     }
@@ -405,14 +410,14 @@ static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride,
         const int src6= src[6*srcStride];
         const int src7= src[7*srcStride];
         const int src8= src[8*srcStride];
-        dst[0*dstStride]= cm[(((src0+src1)*160 - (src0+src2)*48 + (src1+src3)*24 - (src2+src4)*8 + r)>>8)];
-        dst[1*dstStride]= cm[(((src1+src2)*160 - (src0+src3)*48 + (src0+src4)*24 - (src1+src5)*8 + r)>>8)];
-        dst[2*dstStride]= cm[(((src2+src3)*160 - (src1+src4)*48 + (src0+src5)*24 - (src0+src6)*8 + r)>>8)];
-        dst[3*dstStride]= cm[(((src3+src4)*160 - (src2+src5)*48 + (src1+src6)*24 - (src0+src7)*8 + r)>>8)];
-        dst[4*dstStride]= cm[(((src4+src5)*160 - (src3+src6)*48 + (src2+src7)*24 - (src1+src8)*8 + r)>>8)];
-        dst[5*dstStride]= cm[(((src5+src6)*160 - (src4+src7)*48 + (src3+src8)*24 - (src2+src8)*8 + r)>>8)];
-        dst[6*dstStride]= cm[(((src6+src7)*160 - (src5+src8)*48 + (src4+src8)*24 - (src3+src7)*8 + r)>>8)];
-        dst[7*dstStride]= cm[(((src7+src8)*160 - (src6+src8)*48 + (src5+src7)*24 - (src4+src6)*8 + r)>>8)];
+        dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
+        dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
+        dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
+        dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
+        dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
+        dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
+        dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
+        dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
         dst++;
         src++;
     }
@@ -485,38 +490,38 @@ static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_h_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 }\
 \
 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
-    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 128-r);\
+    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 }\
 \
 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_h_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
 }\
 \
 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_v_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 }\
 \
 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
-    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 128-r);\
+    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 }\
 \
 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_v_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
 }\
 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -524,9 +529,9 @@ static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -534,9 +539,9 @@ static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -544,9 +549,9 @@ static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -554,25 +559,25 @@ static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 halfH[72];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 halfH[72];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -580,9 +585,9 @@ static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -590,16 +595,16 @@ static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 halfH[72];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
 }\
 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
     qpel_mc00_c ## name,                                                                   \
@@ -623,12 +628,12 @@ qpel_mc_func qpel_mc ## name ## _tab[16]={ \
 QPEL_MC(0, _rnd)
 QPEL_MC(1, _no_rnd)
 
-int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - pix2[0]);
         s += abs(pix1[1] - pix2[1]);
         s += abs(pix1[2] - pix2[2]);
@@ -651,12 +656,12 @@ int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
-int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
@@ -679,13 +684,13 @@ int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
-int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
     UINT8 *pix3 = pix2 + line_size;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
@@ -709,13 +714,13 @@ int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
-int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
     UINT8 *pix3 = pix2 + line_size;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
@@ -739,6 +744,90 @@ int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
+/* Sum of absolute differences (SAD) between two 8x8 pixel blocks.
+ * pix1 and pix2 address the first pixel of each block; both blocks are
+ * walked with the same row pitch, line_size.
+ * Returns the total of |pix1 - pix2| over all 64 pixel pairs. */
+int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+{
+    int sad = 0;
+    int row, col;
+
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++) {
+            sad += abs(pix1[col] - pix2[col]);
+        }
+        /* step both blocks down one row */
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    return sad;
+}
+
+/* SAD between the 8x8 block at pix1 and a horizontally interpolated
+ * 8x8 block built from pix2: every reference sample is
+ * avg2(pix2[x], pix2[x + 1]), so 9 pixels are read from each pix2 row.
+ * Both blocks use the row pitch line_size. Returns the summed SAD. */
+int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+{
+    int sad = 0;
+    int row, col;
+
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++) {
+            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
+        }
+        /* step both blocks down one row */
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    return sad;
+}
+
+/* SAD between the 8x8 block at pix1 and a vertically interpolated
+ * 8x8 block built from pix2: every reference sample is avg2() of a
+ * pix2 pixel and the pixel one row (line_size bytes) below it, so
+ * 9 rows of pix2 are read. Returns the summed SAD. */
+int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+{
+    UINT8 *below = pix2 + line_size;
+    int sad = 0;
+    int row, col;
+
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++) {
+            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
+        }
+        /* step all three row pointers down one row */
+        pix1 += line_size;
+        pix2 += line_size;
+        below += line_size;
+    }
+    return sad;
+}
+
+/* SAD between the 8x8 block at pix1 and a diagonally interpolated
+ * 8x8 block built from pix2: every reference sample is avg4() of the
+ * 2x2 neighbourhood (x, x + 1) on the current row and the row below,
+ * so a 9x9 area of pix2 is read. Returns the summed SAD. */
+int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+{
+    UINT8 *below = pix2 + line_size;
+    int sad = 0;
+    int row, col;
+
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++) {
+            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
+        }
+        /* step all three row pointers down one row */
+        pix1 += line_size;
+        pix2 += line_size;
+        below += line_size;
+    }
+    return sad;
+}
+
 /* permute block according so that it corresponds to the MMX idct
    order */
 #ifdef SIMPLE_IDCT
@@ -802,10 +891,14 @@ void dsputil_init(void)
     add_pixels_clamped = add_pixels_clamped_c;
     gmc1= gmc1_c;
 
-    pix_abs16x16 = pix_abs16x16_c;
-    pix_abs16x16_x2 = pix_abs16x16_x2_c;
-    pix_abs16x16_y2 = pix_abs16x16_y2_c;
+    pix_abs16x16     = pix_abs16x16_c;
+    pix_abs16x16_x2  = pix_abs16x16_x2_c;
+    pix_abs16x16_y2  = pix_abs16x16_y2_c;
     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
+    pix_abs8x8     = pix_abs8x8_c;
+    pix_abs8x8_x2  = pix_abs8x8_x2_c;
+    pix_abs8x8_y2  = pix_abs8x8_y2_c;
+    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
     av_fdct = jpeg_fdct_ifast;
 
     use_permuted_idct = 1;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 8730d69be7..45c1a695a7 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -66,17 +66,21 @@ extern void (*sub_pixels_tab[4])(DCTELEM *block, const UINT8 *pixels, int line_s
 
 /* motion estimation */
 
-typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size, int h);
+typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size);
 
 extern op_pixels_abs_func pix_abs16x16;
 extern op_pixels_abs_func pix_abs16x16_x2;
 extern op_pixels_abs_func pix_abs16x16_y2;
 extern op_pixels_abs_func pix_abs16x16_xy2;
-
-int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
+extern op_pixels_abs_func pix_abs8x8;
+extern op_pixels_abs_func pix_abs8x8_x2;
+extern op_pixels_abs_func pix_abs8x8_y2;
+extern op_pixels_abs_func pix_abs8x8_xy2;
+
+int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx);
 
 static inline int block_permute_op(int j)
 {
diff --git a/libavcodec/h263.c b/libavcodec/h263.c
index c7fdf557bb..317db431a0 100644
--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@@ -469,14 +469,8 @@ void h263_encode_mb(MpegEncContext * s,
     }
 
     /* encode each block */
-    if (s->h263_pred) {
-	for (i = 0; i < 6; i++) {
-//	    mpeg4_encode_block(s, block[i], i);
-	}
-    } else {
-	for (i = 0; i < 6; i++) {
-	    h263_encode_block(s, block[i], i);
-	}
+    for (i = 0; i < 6; i++) {
+        h263_encode_block(s, block[i], i);
     }
 }
 
@@ -778,8 +772,8 @@ void h263_encode_init(MpegEncContext *s)
     s->mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p
     
     // use fcodes >1 only for mpeg4 & h263 & h263p FIXME
-    if(s->h263_plus)      s->fcode_tab= umv_fcode_tab;
-    else if(s->h263_pred) s->fcode_tab= fcode_tab;
+    if(s->h263_plus) s->fcode_tab= umv_fcode_tab;
+    else if(s->h263_pred && !s->h263_msmpeg4) s->fcode_tab= fcode_tab;
 }
 
 static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n)
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 6b35d47534..09a7174126 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -24,19 +24,34 @@
 
 int mm_flags; /* multimedia extension flags */
 
-int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
+int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+
+int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+
+int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+
+int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+
 
 /* external functions, from idct_mmx.c */
 void ff_mmx_idct(DCTELEM *block);
 void ff_mmxext_idct(DCTELEM *block);
 
 /* pixel operations */
-static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
-static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
+static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
+static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
 
@@ -1035,10 +1050,14 @@ void dsputil_init_mmx(void)
         put_pixels_clamped = put_pixels_clamped_mmx;
         add_pixels_clamped = add_pixels_clamped_mmx;
         
-        pix_abs16x16 = pix_abs16x16_mmx;
-        pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
-        pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
+        pix_abs16x16     = pix_abs16x16_mmx;
+        pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
+        pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
         pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
+        pix_abs8x8    = pix_abs8x8_mmx;
+        pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
+        pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
+        pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
         av_fdct = fdct_mmx;
 
         put_pixels_tab[0] = put_pixels_mmx;
@@ -1067,10 +1086,16 @@ void dsputil_init_mmx(void)
         sub_pixels_tab[3] = sub_pixels_xy2_mmx;
 
         if (mm_flags & MM_MMXEXT) {
-            pix_abs16x16 = pix_abs16x16_sse;
-        }
-
-        if (mm_flags & MM_SSE) {
+            pix_abs16x16    = pix_abs16x16_mmx2;
+            pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
+            pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
+            pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
+            
+            pix_abs8x8    = pix_abs8x8_mmx2;
+            pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
+            pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
+            pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
+            
             put_pixels_tab[1] = put_pixels_x2_sse;
             put_pixels_tab[2] = put_pixels_y2_sse;
             
diff --git a/libavcodec/i386/motion_est_mmx.c b/libavcodec/i386/motion_est_mmx.c
index 35b16b711c..e704c42194 100644
--- a/libavcodec/i386/motion_est_mmx.c
+++ b/libavcodec/i386/motion_est_mmx.c
@@ -16,229 +16,347 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
+ * mostly by Michael Niedermayer <michaelni@gmx.at>
  */
 #include "../dsputil.h"
-#include "mmx.h"
 
-static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
-static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
+static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={ /* 8-byte aligned; entry i holds the value i in each of its four 16-bit words */
+0x0000000000000000ULL, /* ULL suffix: the literal is 64-bit regardless of the width of long */
+0x0001000100010001ULL,
+0x0002000200020002ULL,
+};
 
-/* mm7 is accumulator, mm6 is zero */
-static inline void sad_add(const UINT8 *p1, const UINT8 *p2)
+static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) /* adds to MMX register mm6 the per-word sums of |blk1 - blk2| over 8-byte-wide rows; returns nothing */
 {
-    movq_m2r(*p1, mm0);
-    movq_m2r(*p2, mm1);
-    movq_r2r(mm0, mm2);
-    psubusb_r2r(mm1, mm0);
-    psubusb_r2r(mm2, mm1);
-    por_r2r(mm1, mm0); /* mm0 is absolute value */
-
-    movq_r2r(mm0, mm1);
-    punpcklbw_r2r(mm6, mm0);
-    punpckhbw_r2r(mm6, mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
+    int len= -(stride<<h); /* negative byte span (stride * 2^h); h is a shift count, not a row count */
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t" /* loop body handles two rows */
+        "movq (%1, %%eax), %%mm0	\n\t" /* 8 bytes of blk1, first row */
+        "movq (%2, %%eax), %%mm2	\n\t" /* 8 bytes of blk2, first row */
+        "movq (%2, %%eax), %%mm4	\n\t" /* second copy of the blk2 row */
+        "addl %3, %%eax			\n\t" /* advance offset by one row */
+        "psubusb %%mm0, %%mm2		\n\t" /* saturating blk2 - blk1 */
+        "psubusb %%mm4, %%mm0		\n\t" /* saturating blk1 - blk2 */
+        "movq (%1, %%eax), %%mm1	\n\t" /* same three loads for the second row */
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "movq (%2, %%eax), %%mm5	\n\t"
+        "psubusb %%mm1, %%mm3		\n\t"
+        "psubusb %%mm5, %%mm1		\n\t"
+        "por %%mm2, %%mm0		\n\t" /* mm0 = |diff| bytes of the first row */
+        "por %%mm1, %%mm3		\n\t" /* mm3 = |diff| bytes of the second row */
+        "movq %%mm0, %%mm1		\n\t"
+        "movq %%mm3, %%mm2		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t" /* interleave with mm7 to widen bytes to words; mm7 is not set here -- assumed zero, TODO confirm caller clears it */
+        "punpckhbw %%mm7, %%mm1		\n\t"
+        "punpcklbw %%mm7, %%mm3		\n\t"
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "paddw %%mm1, %%mm0		\n\t"
+        "paddw %%mm3, %%mm2		\n\t"
+        "paddw %%mm2, %%mm0		\n\t" /* four word sums covering both rows */
+        "paddw %%mm0, %%mm6		\n\t" /* accumulate into mm6, which is not initialized in this function */
+        "addl %3, %%eax			\n\t" /* advance past the second row */
+        " js 1b				\n\t" /* repeat while the offset is still negative */
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) /* bases are biased by -len so base+eax starts at blk1/blk2 */
+    );
 }
 
-/* convert mm7 to value */
-static inline int sad_end(void)
+static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) /* psadbw-based variant: adds the SAD of 8-byte-wide rows of blk1 vs blk2 to mm6; returns nothing */
 {
-    int res;
-
-    movq_r2r(mm7, mm0);
-    psrlq_i2r(32, mm7);
-    paddusw_r2r(mm0, mm7);
-
-    movq_r2r(mm7, mm0);
-    psrlq_i2r(16, mm7);
-    paddusw_r2r(mm0, mm7);
-    __asm __volatile ("movd %%mm7, %0" : "=a" (res));
-    return res & 0xffff;
+    int len= -(stride<<h); /* negative byte span (stride * 2^h); h is a shift count, not a row count */
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t" /* loop body handles two rows */
+        "movq (%1, %%eax), %%mm0	\n\t" /* 8 bytes of blk1, first row */
+        "movq (%2, %%eax), %%mm2	\n\t" /* 8 bytes of blk2, first row */
+        "psadbw %%mm2, %%mm0		\n\t" /* mm0 = SAD of the first row */
+        "addl %3, %%eax			\n\t" /* advance offset by one row */
+        "movq (%1, %%eax), %%mm1	\n\t" /* second row of blk1 */
+        "movq (%2, %%eax), %%mm3	\n\t" /* second row of blk2 */
+        "psadbw %%mm1, %%mm3		\n\t" /* mm3 = SAD of the second row */
+        "paddw %%mm3, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t" /* accumulate into mm6, which is not initialized in this function */
+        "addl %3, %%eax			\n\t" /* advance past the second row */
+        " js 1b				\n\t" /* repeat while the offset is still negative */
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) /* bases are biased by -len so base+eax starts at blk1/blk2 */
+    );
 }
 
-int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) /* adds to mm6 the SAD between blk2 and the pavgb average of blk1a and blk1b, 8 bytes per row; returns nothing */
 {
-    const UINT8 *p1, *p2;
-
-    h >>= 1;
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    do {
-        sad_add(p1, p2);
-        sad_add(p1 + 8, p2 + 8);
-        p1 += lx;
-        p2 += lx;
-        sad_add(p1, p2);
-        sad_add(p1 + 8, p2 + 8);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int len= -(stride<<h); /* negative byte span (stride * 2^h); h is a shift count, not a row count */
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t" /* loop body handles two rows */
+        "movq (%1, %%eax), %%mm0	\n\t" /* blk1a, first row */
+        "movq (%2, %%eax), %%mm2	\n\t" /* blk1b, first row */
+        "pavgb %%mm2, %%mm0		\n\t" /* mm0 = byte-wise average of blk1a and blk1b */
+        "movq (%3, %%eax), %%mm2	\n\t" /* blk2, first row */
+        "psadbw %%mm2, %%mm0		\n\t" /* mm0 = SAD of averaged row vs blk2 */
+        "addl %4, %%eax			\n\t" /* advance offset by one row */
+        "movq (%1, %%eax), %%mm1	\n\t" /* same sequence for the second row */
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "pavgb %%mm1, %%mm3		\n\t"
+        "movq (%3, %%eax), %%mm1	\n\t"
+        "psadbw %%mm1, %%mm3		\n\t"
+        "paddw %%mm3, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t" /* accumulate into mm6, which is not initialized in this function */
+        "addl %4, %%eax			\n\t" /* advance past the second row */
+        " js 1b				\n\t" /* repeat while the offset is still negative */
+        : "+a" (len)
+        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) /* bases are biased by -len so base+eax starts at each block */
+    );
 }
 
-/* please test it ! */
-static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2)
-{
-    movq_m2r(*(p1 + 0), mm0);
-    movq_m2r(*(p1 + 8), mm1);
-    psadbw_m2r(*(p2 + 0), mm0);
-    psadbw_m2r(*(p2 + 8), mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
+static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) /* adds to mm6 the SAD between blk2 and a 2x2-averaged blk1 (x, x+1 on this row and the next), 8 bytes per row; returns nothing */
+{ //FIXME reuse src
+    int len= -(stride<<h); /* negative byte span (stride * 2^h); h is a shift count, not a row count */
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t" /* loop body handles two rows */
+        "movq (%1, %%eax), %%mm0	\n\t" /* blk1 row, columns x */
+        "movq (%2, %%eax), %%mm2	\n\t" /* blk1 next row, columns x */
+        "movq 1(%1, %%eax), %%mm1	\n\t" /* blk1 row, columns x+1 */
+        "movq 1(%2, %%eax), %%mm3	\n\t" /* blk1 next row, columns x+1 */
+        "pavgb %%mm2, %%mm0		\n\t" /* vertical average at x */
+        "pavgb %%mm1, %%mm3		\n\t" /* vertical average at x+1 */
+        "pavgb %%mm3, %%mm0		\n\t" /* average of the two: three cascaded pavgb, so rounding can differ from one exact 4-sample average */
+        "movq (%3, %%eax), %%mm2	\n\t" /* blk2 row */
+        "psadbw %%mm2, %%mm0		\n\t" /* mm0 = SAD of interpolated row vs blk2 */
+        "addl %4, %%eax			\n\t" /* advance offset by one row */
+        "movq (%1, %%eax), %%mm1	\n\t" /* same sequence for the second row */
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "movq 1(%1, %%eax), %%mm2	\n\t"
+        "movq 1(%2, %%eax), %%mm4	\n\t"
+        "pavgb %%mm3, %%mm1		\n\t"
+        "pavgb %%mm4, %%mm2		\n\t"
+        "pavgb %%mm1, %%mm2		\n\t"
+        "movq (%3, %%eax), %%mm1	\n\t"
+        "psadbw %%mm1, %%mm2		\n\t"
+        "paddw %%mm2, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t" /* accumulate into mm6, which is not initialized in this function */
+        "addl %4, %%eax			\n\t" /* advance past the second row */
+        " js 1b				\n\t" /* repeat while the offset is still negative */
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride) /* %1 = current blk1 row, %2 = blk1 row below, %3 = blk2; all biased by -len */
+    );
 }
 
-int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) /* plain-MMX variant: adds to mm6 the per-word sums of |blk2 - ((blk1a + blk1b + mm5) >> 1)|, 8 bytes per row; returns nothing */
 {
-    const UINT8 *p1, *p2;
-
-    h >>= 1;
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    do {
-        sad_add_sse(p1, p2);
-        p1 += lx;
-        p2 += lx;
-        sad_add_sse(p1, p2);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int len= -(stride<<h); /* negative byte span (stride * 2^h); h is a shift count, not a row count */
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t" /* loop body handles one row */
+        "movq (%1, %%eax), %%mm0	\n\t" /* blk1a row (for the low 4 bytes) */
+        "movq (%2, %%eax), %%mm1	\n\t" /* blk1b row (for the low 4 bytes) */
+        "movq (%1, %%eax), %%mm2	\n\t" /* blk1a row again (for the high 4 bytes) */
+        "movq (%2, %%eax), %%mm3	\n\t" /* blk1b row again (for the high 4 bytes) */
+        "punpcklbw %%mm7, %%mm0		\n\t" /* interleave with mm7 to widen bytes to words; mm7 is not set here -- assumed zero, TODO confirm caller clears it */
+        "punpcklbw %%mm7, %%mm1		\n\t"
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "punpckhbw %%mm7, %%mm3		\n\t"
+        "paddw %%mm0, %%mm1		\n\t" /* blk1a + blk1b, low words */
+        "paddw %%mm2, %%mm3		\n\t" /* blk1a + blk1b, high words */
+        "movq (%3, %%eax), %%mm4	\n\t" /* blk2 row */
+        "movq (%3, %%eax), %%mm2	\n\t" /* second copy of the blk2 row */
+        "paddw %%mm5, %%mm1		\n\t" /* add the term held in mm5; mm5 is not set here -- presumably a rounding constant loaded by the caller, confirm */
+        "paddw %%mm5, %%mm3		\n\t"
+        "psrlw $1, %%mm1		\n\t" /* halve: (a + b + mm5) >> 1 */
+        "psrlw $1, %%mm3		\n\t"
+        "packuswb %%mm3, %%mm1		\n\t" /* averaged row packed back to 8 bytes */
+        "psubusb %%mm1, %%mm4		\n\t" /* saturating blk2 - avg */
+        "psubusb %%mm2, %%mm1		\n\t" /* saturating avg - blk2 */
+        "por %%mm4, %%mm1		\n\t" /* mm1 = |blk2 - avg| bytes */
+        "movq %%mm1, %%mm0		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t"
+        "punpckhbw %%mm7, %%mm1		\n\t"
+        "paddw %%mm1, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t" /* accumulate into mm6, which is not initialized in this function */
+        "addl %4, %%eax			\n\t" /* advance offset by one row */
+        " js 1b				\n\t" /* repeat while the offset is still negative */
+        : "+a" (len)
+        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) /* bases are biased by -len so base+eax starts at each block */
+    );
 }
 
-#define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); }
-
-/* mm7 is accumulator, mm6 is zero */
-static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3)
+static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
 {
-    movq_m2r(*(p2 + 0), mm0);
-    movq_m2r(*(p3 + 0), mm1);
-    movq_r2r(mm0, mm2);
-    movq_r2r(mm1, mm3);
-    punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */
-    punpcklbw_r2r(mm6, mm1);
-    punpckhbw_r2r(mm6, mm2); /* high */
-    punpckhbw_r2r(mm6, mm3); 
-    paddusw_r2r(mm1, mm0);
-    paddusw_r2r(mm3, mm2);
-    movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */
-    paddusw_r2r(mm5, mm0); /* + 1 */
-    paddusw_r2r(mm5, mm2); /* + 1 */
-    psrlw_i2r(1, mm0);
-    psrlw_i2r(1, mm2);
-    packuswb_r2r(mm2, mm0); /* average is in mm0 */
-
-    movq_r2r(mm1, mm2); 
-    psubusb_r2r(mm0, mm1);
-    psubusb_r2r(mm2, mm0);
-    por_r2r(mm1, mm0); /* mm0 is absolute value */
-
-    movq_r2r(mm0, mm1);
-    punpcklbw_r2r(mm6, mm0);
-    punpckhbw_r2r(mm6, mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
+    int len= -(stride<<h); /* adds to mm6 the SAD of blk2 vs. the 4-tap (2x2) average of blk1, 8 pixels wide, 1<<h rows; eax counts from len up to 0 */
+    asm volatile( /* uses mm7 as the zero register and mm5 as the rounding term; PIX_SAD sets both up before calling */
+        ".balign 16			\n\t"
+        "1:				\n\t" /* one row per iteration */
+        "movq (%1, %%eax), %%mm0	\n\t" /* blk1, row y */
+        "movq (%2, %%eax), %%mm1	\n\t" /* blk1, row y+1 */
+        "movq %%mm0, %%mm4		\n\t"
+        "movq %%mm1, %%mm2		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t" /* bytes -> 16-bit words, pixels 0..3 */
+        "punpcklbw %%mm7, %%mm1		\n\t"
+        "punpckhbw %%mm7, %%mm4		\n\t" /* pixels 4..7 */
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "paddw %%mm1, %%mm0		\n\t" /* mm0 = vertical pair sum, low half */
+        "paddw %%mm2, %%mm4		\n\t" /* mm4 = vertical pair sum, high half */
+        "movq 1(%1, %%eax), %%mm2	\n\t" /* blk1, row y, one pixel to the right */
+        "movq 1(%2, %%eax), %%mm3	\n\t" /* blk1, row y+1, one pixel to the right */
+        "movq %%mm2, %%mm1		\n\t"
+        "punpcklbw %%mm7, %%mm2		\n\t"
+        "punpckhbw %%mm7, %%mm1		\n\t"
+        "paddw %%mm0, %%mm2		\n\t" /* three taps summed, low half in mm2 */
+        "paddw %%mm4, %%mm1		\n\t" /* three taps summed, high half in mm1 */
+        "movq %%mm3, %%mm4		\n\t"
+        "punpcklbw %%mm7, %%mm3		\n\t"
+        "punpckhbw %%mm7, %%mm4		\n\t"
+        "paddw %%mm3, %%mm2		\n\t" /* all four taps, low half */
+        "paddw %%mm4, %%mm1		\n\t" /* all four taps, high half */
+        "movq (%3, %%eax), %%mm3	\n\t" /* blk2 row */
+        "movq (%3, %%eax), %%mm4	\n\t" /* second copy of the blk2 row */
+        "paddw %%mm5, %%mm2		\n\t" /* add the rounding term */
+        "paddw %%mm5, %%mm1		\n\t"
+        "psrlw $2, %%mm2		\n\t"
+        "psrlw $2, %%mm1		\n\t"
+        "packuswb %%mm1, %%mm2		\n\t" /* mm2 = averaged row as bytes */
+        "psubusb %%mm2, %%mm3		\n\t" /* saturating blk2 - avg */
+        "psubusb %%mm4, %%mm2		\n\t" /* saturating avg - blk2 */
+        "por %%mm3, %%mm2		\n\t" /* one of the two is zero, so the OR is |avg - blk2| */
+        "movq %%mm2, %%mm0		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t"
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "paddw %%mm2, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t" /* accumulate into the four word lanes of mm6 */
+        "addl %4, %%eax			\n\t" /* advance one row */
+        " js 1b				\n\t" /* loop while eax is still negative */
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
+    );
 }
 
-int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline int sum_mmx()
 {
-    const UINT8 *p1, *p2;
-
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    movq_m2r(mm_wone, mm5); /* one constant */
-    do {
-        sad_add_x2(p1, p2, p2 + 1);
-        sad_add_x2(p1 + 8, p2 + 8, p2 + 9);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int ret; /* receives the low 32 bits of mm6 after the horizontal add */
+    asm volatile( /* folds the four 16-bit lanes of the mm6 accumulator into its lowest word */
+        "movq %%mm6, %%mm0		\n\t"
+        "psrlq $32, %%mm6		\n\t"
+        "paddw %%mm0, %%mm6		\n\t" /* lanes 0,1 now hold w0+w2 and w1+w3 */
+        "movq %%mm6, %%mm0		\n\t"
+        "psrlq $16, %%mm6		\n\t"
+        "paddw %%mm0, %%mm6		\n\t" /* lane 0 now holds w0+w1+w2+w3 */
+        "movd %%mm6, %0			\n\t"
+        : "=r" (ret)
+    );
+    return ret&0xFFFF; /* only the lowest word is the full sum; the word above it is a partial sum */
 }
 
-int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline int sum_mmx2()
 {
-    const UINT8 *p1, *p2;
-
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    movq_m2r(mm_wone, mm5); /* one constant */
-    do {
-        sad_add_x2(p1, p2, p2 + lx);
-        sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int ret; /* receives the low 32 bits of the mm6 accumulator */
+    asm volatile( /* no horizontal add or masking here, unlike sum_mmx */
+        "movd %%mm6, %0			\n\t" /* NOTE(review): relies on the psadbw-based helpers leaving the total in the low word with the bits above it zero -- confirm */
+        : "=r" (ret)
+    );
+    return ret;
 }
 
-/* mm7 is accumulator, mm6 is zero */
-static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3)
-{
-    movq_m2r(*(p2 + 0), mm0);
-    movq_m2r(*(p3 + 0), mm1);
-    movq_r2r(mm0, mm2);
-    movq_r2r(mm1, mm3);
-    punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */
-    punpcklbw_r2r(mm6, mm1);
-    punpckhbw_r2r(mm6, mm2); /* high */
-    punpckhbw_r2r(mm6, mm3); 
-    paddusw_r2r(mm1, mm0);
-    paddusw_r2r(mm3, mm2);
-
-    movq_m2r(*(p2 + 1), mm1);
-    movq_m2r(*(p3 + 1), mm3);
-    movq_r2r(mm1, mm4);
-    punpcklbw_r2r(mm6, mm1); /* low */
-    punpckhbw_r2r(mm6, mm4); /* high */
-    paddusw_r2r(mm1, mm0);
-    paddusw_r2r(mm4, mm2);
-    movq_r2r(mm3, mm4);
-    punpcklbw_r2r(mm6, mm3); /* low */
-    punpckhbw_r2r(mm6, mm4); /* high */
-    paddusw_r2r(mm3, mm0);
-    paddusw_r2r(mm4, mm2);
-    
-    movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */
-    paddusw_r2r(mm5, mm0); /* + 2 */
-    paddusw_r2r(mm5, mm2); /* + 2 */
-    psrlw_i2r(2, mm0);
-    psrlw_i2r(2, mm2);
-    packuswb_r2r(mm2, mm0); /* average is in mm0 */
+#define PIX_SAD(suf) /* emits pix_abs{8x8,16x16}{,_x2,_y2,_xy2}_<suf>: blk2 is compared against (interpolated) blk1; each zeroes mm7 and the mm6 accumulator first */\
+int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* 8 columns, 1<<3 rows, blk1 used as is */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t":);\
+\
+    sad8_ ## suf(blk1, blk2, stride, 3);\
+\
+    return sum_ ## suf();\
+}\
+\
+int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* blk2 vs. average of blk1 and blk1+1 (right neighbour) */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+\
+    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 3); /* both interpolation sources must come from blk1, as in pix_abs16x16_x2_ */\
+\
+    return sum_ ## suf();\
+}\
+\
+int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* blk2 vs. average of blk1 and blk1+stride (row below) */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+\
+    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\
+\
+    return sum_ ## suf();\
+}\
+\
+int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* blk2 vs. 2x2 average of blk1; mm5 is loaded from round_tab[2] instead of round_tab[1] */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[2]) \
+                 );\
+\
+    sad8_4_ ## suf(blk1, blk2, stride, 3);\
+\
+    return sum_ ## suf();\
+}\
+\
+int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* 16x16 variants: two 8-column passes over 1<<4 rows sharing one accumulator */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t":);\
+\
+    sad8_ ## suf(blk1  , blk2  , stride, 4);\
+    sad8_ ## suf(blk1+8, blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
+int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* blk2 vs. average of blk1 and blk1+1 */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+\
+    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, 4);\
+    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
+int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* blk2 vs. average of blk1 and blk1+stride */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+\
+    sad8_2_ ## suf(blk1  , blk1+stride,  blk2  , stride, 4);\
+    sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
+int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* blk2 vs. 2x2 average of blk1 */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[2]) \
+                 );\
+\
+    sad8_4_ ## suf(blk1  , blk2  , stride, 4);\
+    sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
 
-    movq_r2r(mm1, mm2); 
-    psubusb_r2r(mm0, mm1);
-    psubusb_r2r(mm2, mm0);
-    por_r2r(mm1, mm0); /* mm0 is absolute value */
-
-    movq_r2r(mm0, mm1);
-    punpcklbw_r2r(mm6, mm0);
-    punpckhbw_r2r(mm6, mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
-}
-
-int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
-{
-    const UINT8 *p1, *p2, *p3;
-
-    p1 = blk1;
-    p2 = blk2;
-    p3 = blk2 + lx;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    movq_m2r(mm_wtwo, mm5); /* one constant */
-    do {
-        sad_add_xy2(p1, p2, p2 + lx);
-        sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
-}
+PIX_SAD(mmx)
+PIX_SAD(mmx2)
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index ce80505bf9..8c4bddd370 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -26,6 +26,7 @@
 #include "mpegvideo.h"
 
 #define ABS(a) ((a)>0 ? (a) : -(a))
+#define MAX(a,b) ((a) > (b) ? (a) : (b)) /* the larger argument is evaluated twice: do not pass expressions with side effects */
 #define INTER_BIAS	257
 
 static void halfpel_motion_search(MpegEncContext * s,
@@ -164,7 +165,7 @@ static int full_motion_search(MpegEncContext * s,
     for (y = y1; y <= y2; y++) {
 	for (x = x1; x <= x2; x++) {
 	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x,
-			     s->linesize, 16);
+			     s->linesize);
 	    if (d < dmin ||
 		(d == dmin &&
 		 (abs(x - xx) + abs(y - yy)) <
@@ -228,7 +229,7 @@ static int log_motion_search(MpegEncContext * s,
     do {
 	for (y = y1; y <= y2; y += range) {
 	    for (x = x1; x <= x2; x += range) {
-		d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16);
+		d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
 		if (d < dmin || (d == dmin && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		    dmin = d;
 		    mx = x;
@@ -308,7 +309,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 	lastx = x;
 	for (x = x1; x <= x2; x += range) {
-	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16);
+	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
 	    if (d < dminx || (d == dminx && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		dminx = d;
 		mx = x;
@@ -317,7 +318,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 	x = lastx;
 	for (y = y1; y <= y2; y += range) {
-	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16);
+	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
 	    if (d < dminy || (d == dminy && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		dminy = d;
 		my = y;
@@ -361,7 +362,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 #define CHECK_MV(x,y)\
 {\
-    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride, 16);\
+    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
     d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
     if(d<dmin){\
         best[0]=x;\
@@ -372,7 +373,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 #define CHECK_MV_DIR(x,y,new_dir)\
 {\
-    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride, 16);\
+    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
     d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
     if(d<dmin){\
         best[0]=x;\
@@ -382,6 +383,30 @@ static int phods_motion_search(MpegEncContext * s,
     }\
 }
 
+#define CHECK_MV4(x,y) /* 8x8 counterpart of CHECK_MV: test candidate (x,y); relies on d, dmin, best and the search parameters of the enclosing scope */\
+{\
+    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant; /* add the vector penalty, indexed by the difference to the predictor */\
+    if(d<dmin){ /* strictly better: remember the candidate */\
+        best[0]=x;\
+        best[1]=y;\
+        dmin=d;\
+    }\
+}
+
+#define CHECK_MV4_DIR(x,y,new_dir) /* like CHECK_MV4, and additionally records new_dir in next_dir when the candidate wins */\
+{\
+    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant; /* add the vector penalty, indexed by the difference to the predictor */\
+    if(d<dmin){ /* strictly better: remember the candidate and the step direction */\
+        best[0]=x;\
+        best[1]=y;\
+        dmin=d;\
+        next_dir= new_dir;\
+    }\
+}
+
+
 #define check(x,y,S,v)\
 if( (x)<(xmin<<(S)) ) printf("%d %d %d %d xmin" #v, (x), (y), s->mb_x, s->mb_y);\
 if( (x)>(xmax<<(S)) ) printf("%d %d %d %d xmax" #v, (x), (y), s->mb_x, s->mb_y);\
@@ -440,6 +465,32 @@ static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
     */
 }
 
+static inline int small_diamond_search4MV(MpegEncContext * s, int *best, int dmin,
+                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
+                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
+                                       int xmin, int ymin, int xmax, int ymax, int shift)
+{ /* 8x8 version of small_diamond_search: steps best[] to a better 4-neighbour until none improves dmin; returns the final dmin */
+    int next_dir=-1; /* direction of the last accepted step: 0=left 1=up 2=right 3=down, -1=none */
+
+    for(;;){
+        int d;
+        const int dir= next_dir;
+        const int x= best[0]; /* centre of this round; best[] itself may be updated by the checks below */
+        const int y= best[1];
+        next_dir=-1;
+
+//printf("%d", dir);
+        if(dir!=2 && x>xmin) CHECK_MV4_DIR(x-1, y  , 0) /* skip the neighbour we just came from; stay inside [xmin,xmax]x[ymin,ymax] */
+        if(dir!=3 && y>ymin) CHECK_MV4_DIR(x  , y-1, 1)
+        if(dir!=0 && x<xmax) CHECK_MV4_DIR(x+1, y  , 2)
+        if(dir!=1 && y<ymax) CHECK_MV4_DIR(x  , y+1, 3)
+
+        if(next_dir==-1){ /* no neighbour was better */
+            return dmin;
+        }
+    }
+}
+
 static inline int snake_search(MpegEncContext * s, int *best, int dmin,
                                        UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
                                        int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
@@ -469,7 +520,7 @@ if(256*256*256*64%point==0)
         x+=x_dir[dir];
         y+=y_dir[dir];
         if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
-            d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride, 16);
+            d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);
             d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;
         }else{
             d = dmin + 10000; //FIXME smarter boundary handling
@@ -517,7 +568,7 @@ static int epzs_motion_search(MpegEncContext * s,
     new_pic = s->new_picture[0] + pic_xy;
     old_pic = s->last_picture[0] + pic_xy;
    
-    dmin = pix_abs16x16(new_pic, old_pic, pic_stride, 16);
+    dmin = pix_abs16x16(new_pic, old_pic, pic_stride);
     if(dmin<Z_THRESHOLD){
         *mx_ptr= 0;
         *my_ptr= 0;
@@ -557,8 +608,56 @@ static int epzs_motion_search(MpegEncContext * s,
     return dmin;
 }
 
+static int epzs_motion_search4(MpegEncContext * s, int block,
+                             int *mx_ptr, int *my_ptr,
+                             int P[6][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax)
+{ /* predictor-based search for one 8x8 block (block = 0..3) of the current MB; writes the best vector to *mx_ptr/*my_ptr and returns its score */
+    int best[2]={0, 0}; /* the search starts from the zero vector */
+    int d, dmin; 
+    UINT8 *new_pic, *old_pic;
+    const int pic_stride= s->linesize;
+    const int pic_xy= ((s->mb_y*2 + (block>>1))*pic_stride + s->mb_x*2 + (block&1))*8; /* pixel offset of the 8x8 block */
+    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    int quant= s->qscale; // qscale of the prev frame
+    const int shift= 1+s->quarter_sample; /* the P[] candidates are scaled down by this shift before being tested */
+
+    new_pic = s->new_picture[0] + pic_xy;
+    old_pic = s->last_picture[0] + pic_xy;
+   
+    dmin = pix_abs8x8(new_pic, old_pic, pic_stride); /* score of the zero vector, without penalty */
+
+    /* first line */
+    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
+        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift) /* only P[1] is tested here; P[2], P[3] are not read on this path */
+    }else{
+        CHECK_MV4(P[4][0]>>shift, P[4][1]>>shift)
+        if(dmin<Z_THRESHOLD){
+            *mx_ptr= best[0]; /* early exit: return the vector dmin belongs to; P[4] may have lost against (0,0) */
+            *my_ptr= best[1];
+//printf("M\n");
+            return dmin;
+        }
+        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift)
+        CHECK_MV4(P[2][0]>>shift, P[2][1]>>shift)
+        CHECK_MV4(P[3][0]>>shift, P[3][1]>>shift)
+    }
+    CHECK_MV4(P[0][0]>>shift, P[0][1]>>shift)
+    CHECK_MV4(P[5][0]>>shift, P[5][1]>>shift)
+
+//check(best[0],best[1],0, b0)
+    dmin= small_diamond_search4MV(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+//check(best[0],best[1],0, b1)
+    *mx_ptr= best[0];
+    *my_ptr= best[1];    
+
+//    printf("%d %d %d \n", best[0], best[1], dmin);
+    return dmin;
+}
+
 #define CHECK_HALF_MV(suffix, x, y) \
-    d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize, 16);\
+    d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
     d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
     if(d<dminh){\
         dminh= d;\
@@ -566,6 +665,15 @@ static int epzs_motion_search(MpegEncContext * s,
         my= my1 + y;\
     }
 
+#define CHECK_HALF_MV4(suffix, x, y) /* 8x8 counterpart of CHECK_HALF_MV: test half-pel offset (x,y) with pix_abs8x8_<suffix>; not brace-wrapped, uses locals of the caller */ \
+    d= pix_abs8x8_ ## suffix(pix, ptr+((x)>>1), s->linesize); /* x = -1 is meant to start one pixel to the left (assumes >> on a negative int rounds down) */\
+    d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
+    if(d<dminh){ /* strictly better: store the vector in half-pel units */\
+        dminh= d;\
+        mx= mx1 + x;\
+        my= my1 + y;\
+    }
+    
 /* The idea would be to make half pel ME after Inter/Intra decision to 
    save time. */
 static inline void halfpel_motion_search(MpegEncContext * s,
@@ -614,6 +722,7 @@ static inline void halfpel_motion_search(MpegEncContext * s,
         CHECK_HALF_MV(xy2, -1, +1)
         CHECK_HALF_MV(y2 ,  0, +1)
         CHECK_HALF_MV(xy2, +1, +1)
+
     }else{
         mx= 2*(mx - xx);
         my= 2*(my - yy);
@@ -623,19 +732,99 @@ static inline void halfpel_motion_search(MpegEncContext * s,
     *my_ptr = my;
 }
 
+static inline void halfpel_motion_search4(MpegEncContext * s,
+				  int *mx_ptr, int *my_ptr, int dmin,
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y, int block_x, int block_y)
+{ /* refines the full-pel vector of one 8x8 block (*mx_ptr,*my_ptr) over its 8 half-pel neighbours; the result is written back doubled, i.e. in half-pel units */
+    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    const int quant= s->qscale;
+    int pen_x, pen_y;
+    int mx, my, mx1, my1, d, xx, yy, dminh;
+    UINT8 *pix, *ptr;
+
+    xx = 8 * block_x; /* block_x/block_y count 8x8 blocks, xx/yy are pixels */
+    yy = 8 * block_y;
+    pix =  s->new_picture[0] + (yy * s->linesize) + xx;
+    
+    mx = *mx_ptr;
+    my = *my_ptr;
+    ptr = s->last_picture[0] + ((yy+my) * s->linesize) + xx + mx; /* reference block at the full-pel vector */
+
+    dminh = dmin; /* a half-pel candidate must beat the full-pel score */
+
+    if (mx > xmin && mx < xmax && 
+        my > ymin && my < ymax) { /* refine only when strictly inside the allowed range */
+
+        mx= mx1= 2*mx; /* switch to half-pel units */
+        my= my1= 2*my;
+        if(dmin < Z_THRESHOLD && mx==0 && my==0){ /* zero vector already below Z_THRESHOLD: keep it */
+            *mx_ptr = 0;
+            *my_ptr = 0;
+            return;
+        }
+        
+        pen_x= pred_x + mx; /* NOTE(review): CHECK_MV4 indexes mv_penalty with (mv - pred) but this adds pred -- confirm the sign */
+        pen_y= pred_y + my;
+
+        ptr-= s->linesize; /* row above, for the three y = -1 candidates */
+        CHECK_HALF_MV4(xy2, -1, -1)
+        CHECK_HALF_MV4(y2 ,  0, -1)
+        CHECK_HALF_MV4(xy2, +1, -1)
+        
+        ptr+= s->linesize; /* back to the row of the full-pel position */
+        CHECK_HALF_MV4(x2 , -1,  0)
+        CHECK_HALF_MV4(x2 , +1,  0)
+        CHECK_HALF_MV4(xy2, -1, +1)
+        CHECK_HALF_MV4(y2 ,  0, +1)
+        CHECK_HALF_MV4(xy2, +1, +1)
+
+    }else{ /* on the border of the range: no refinement, only convert to half-pel units */
+        mx*=2;
+        my*=2;
+    }
+
+    *mx_ptr = mx;
+    *my_ptr = my;
+}
+
+static inline void set_mv_tables(MpegEncContext * s, int mx, int my)
+{ /* stores (mx,my) as the vector of the current macroblock (s->mb_x, s->mb_y) */
+    const int xy= s->mb_x + s->mb_y*s->mb_width; /* macroblock index in raster order */
+    
+    s->mv_table[0][xy] = mx;
+    s->mv_table[1][xy] = my;
+
+    /* has already been set to the 4 MVs if 4MV is done */
+    if(!(s->flags&CODEC_FLAG_4MV)){
+        int mot_xy= s->block_index[0]; /* motion_val index of the first block; the other three are reached via +1 and +block_wrap[0] */
+
+        s->motion_val[mot_xy  ][0]= mx;
+        s->motion_val[mot_xy  ][1]= my;
+        s->motion_val[mot_xy+1][0]= mx;
+        s->motion_val[mot_xy+1][1]= my;
+
+        mot_xy += s->block_wrap[0]; /* next row of blocks */
+        s->motion_val[mot_xy  ][0]= mx;
+        s->motion_val[mot_xy  ][1]= my;
+        s->motion_val[mot_xy+1][0]= mx;
+        s->motion_val[mot_xy+1][1]= my;
+    }
+}
+
 #ifndef CONFIG_TEST_MV_ENCODE
 
-int estimate_motion(MpegEncContext * s,
-		    int mb_x, int mb_y,
-		    int *mx_ptr, int *my_ptr)
+void estimate_motion(MpegEncContext * s,
+		    int mb_x, int mb_y)
 {
     UINT8 *pix, *ppix;
     int sum, varc, vard, mx, my, range, dmin, xx, yy;
     int xmin, ymin, xmax, ymax;
     int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
     int pred_x=0, pred_y=0;
-    int P[5][2];
+    int P[6][2];
     const int shift= 1+s->quarter_sample;
+    int mb_type=0;
     
     range = 8 * (1 << (s->f_code - 1));
     /* XXX: temporary kludge to avoid overflow for msmpeg4 */
@@ -680,14 +869,13 @@ int estimate_motion(MpegEncContext * s,
     case ME_X1:
     case ME_EPZS:
        {
-            static const int off[4]= {2, 1, 1, -1};
-            const int mot_stride = s->mb_width*2 + 2;
-            const int mot_xy = (s->mb_y*2 + 1)*mot_stride + s->mb_x*2 + 1;
+            const int mot_stride = s->block_wrap[0];
+            const int mot_xy = s->block_index[0];
 
-            rel_xmin= xmin - s->mb_x*16;
-            rel_xmax= xmax - s->mb_x*16;
-            rel_ymin= ymin - s->mb_y*16;
-            rel_ymax= ymax - s->mb_y*16;
+            rel_xmin= xmin - mb_x*16;
+            rel_xmax= xmax - mb_x*16;
+            rel_ymin= ymin - mb_y*16;
+            rel_ymax= ymax - mb_y*16;
 
             P[0][0] = s->motion_val[mot_xy    ][0];
             P[0][1] = s->motion_val[mot_xy    ][1];
@@ -696,14 +884,14 @@ int estimate_motion(MpegEncContext * s,
             if(P[1][0] > (rel_xmax<<shift)) P[1][0]= (rel_xmax<<shift);
 
             /* special case for first line */
-            if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
+            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
                 P[4][0] = P[1][0];
                 P[4][1] = P[1][1];
             } else {
                 P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
                 P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
-                P[3][0] = s->motion_val[mot_xy - mot_stride + off[0]    ][0];
-                P[3][1] = s->motion_val[mot_xy - mot_stride + off[0]    ][1];
+                P[3][0] = s->motion_val[mot_xy - mot_stride + 2         ][0];
+                P[3][1] = s->motion_val[mot_xy - mot_stride + 2         ][1];
                 if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift);
                 if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift);
                 if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift);
@@ -721,10 +909,72 @@ int estimate_motion(MpegEncContext * s,
         }
         dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax);
  
-        mx+= s->mb_x*16;
-        my+= s->mb_y*16;
+        mx+= mb_x*16;
+        my+= mb_y*16;
         break;
     }
+    
+    if(s->flags&CODEC_FLAG_4MV){
+        int block;
+
+        mb_type|= MB_TYPE_INTER4V;
+
+        for(block=0; block<4; block++){
+            int mx4, my4;
+            int pred_x4, pred_y4;
+            int dmin4;
+            static const int off[4]= {2, 1, 1, -1};
+            const int mot_stride = s->block_wrap[0];
+            const int mot_xy = s->block_index[block];
+            const int block_x= mb_x*2 + (block&1);
+            const int block_y= mb_y*2 + (block>>1);
+
+            const int rel_xmin4= xmin - block_x*8;
+            const int rel_xmax4= xmax - block_x*8;
+            const int rel_ymin4= ymin - block_y*8;
+            const int rel_ymax4= ymax - block_y*8;
+
+            P[0][0] = s->motion_val[mot_xy    ][0];
+            P[0][1] = s->motion_val[mot_xy    ][1];
+            P[1][0] = s->motion_val[mot_xy - 1][0];
+            P[1][1] = s->motion_val[mot_xy - 1][1];
+            if(P[1][0] > (rel_xmax4<<shift)) P[1][0]= (rel_xmax4<<shift);
+
+            /* special case for first line */
+            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
+                P[4][0] = P[1][0];
+                P[4][1] = P[1][1];
+            } else {
+                P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
+                P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
+                P[3][0] = s->motion_val[mot_xy - mot_stride + off[block]][0];
+                P[3][1] = s->motion_val[mot_xy - mot_stride + off[block]][1];
+                if(P[2][1] > (rel_ymax4<<shift)) P[2][1]= (rel_ymax4<<shift);
+                if(P[3][0] < (rel_xmin4<<shift)) P[3][0]= (rel_xmin4<<shift);
+                if(P[3][1] > (rel_ymax4<<shift)) P[3][1]= (rel_ymax4<<shift);
+        
+                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
+                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
+            }
+            if(s->out_format == FMT_H263){
+                pred_x4 = P[4][0];
+                pred_y4 = P[4][1];
+            }else { /* mpeg1 at least */
+                pred_x4= P[1][0];
+                pred_y4= P[1][1];
+            }
+            P[5][0]= mx - mb_x*16;
+            P[5][1]= my - mb_y*16;
+
+            dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4);
+
+            halfpel_motion_search4(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
+                                   pred_x4, pred_y4, block_x, block_y);
+     
+            s->motion_val[ s->block_index[block] ][0]= mx4;
+            s->motion_val[ s->block_index[block] ][1]= my4;
+        }
+    }
 
     /* intra / predictive decision */
     xx = mb_x * 16;
@@ -737,7 +987,7 @@ int estimate_motion(MpegEncContext * s,
     sum = pix_sum(pix, s->linesize);
 #if 0
     varc = pix_dev(pix, s->linesize, (sum+128)>>8) + INTER_BIAS;
-    vard = pix_abs16x16(pix, ppix, s->linesize, 16);
+    vard = pix_abs16x16(pix, ppix, s->linesize);
 #else
     sum= (sum+8)>>4;
     varc = ((pix_norm1(pix, s->linesize) - sum*sum + 128 + 500)>>8);
@@ -745,30 +995,38 @@ int estimate_motion(MpegEncContext * s,
 #endif
 
     s->mb_var[s->mb_width * mb_y + mb_x] = varc;
-    s->avg_mb_var += varc;
+    s->avg_mb_var+= varc;
     s->mc_mb_var += vard;
 
 #if 0
     printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n",
 	   varc, s->avg_mb_var, sum, vard, mx - xx, my - yy);
 #endif
-    if (vard <= 64 || vard < varc) {
-        if (s->full_search != ME_ZERO) {
+    if(s->flags&CODEC_FLAG_HQ){
+        if (vard*2 + 200 > varc)
+            mb_type|= MB_TYPE_INTRA;
+        if (varc*2 + 200 > vard){
+            mb_type|= MB_TYPE_INTER;
             halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y);
-        } else {
-            mx -= 16 * s->mb_x;
-            my -= 16 * s->mb_y;
         }
-//        check(mx + 32*s->mb_x, my + 32*s->mb_y, 1, end)
-
-	*mx_ptr = mx;
-	*my_ptr = my;
-	return 0;
-    } else {
-	*mx_ptr = 0;
-	*my_ptr = 0;
-	return 1;
+    }else{
+        if (vard <= 64 || vard < varc) {
+            mb_type|= MB_TYPE_INTER;
+            if (s->full_search != ME_ZERO) {
+                halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y);
+            } else {
+                mx -= 16 * mb_x;
+                my -= 16 * mb_y;
+            }
+        }else{
+            mb_type|= MB_TYPE_INTRA;
+            mx = 0;//mx*2 - 32 * mb_x;
+            my = 0;//my*2 - 32 * mb_y;
+        }
     }
+
+    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
+    set_mv_tables(s, mx, my);
 }
 
 #else
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index a4d649cfe5..c06f51e8af 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -227,6 +227,8 @@ int MPV_common_init(MpegEncContext *s)
         if (!s->mbskip_table)
             goto fail;
     }
+    
+    s->block= s->intra_block;
 
     s->context_initialized = 1;
     return 0;
@@ -295,7 +297,7 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->qblur= avctx->qblur;
     s->avctx = avctx;
     s->aspect_ratio_info= avctx->aspect_ratio_info;
-    s->hq= (avctx->flags & CODEC_FLAG_HQ);
+    s->flags= avctx->flags;
     
     if (s->gop_size <= 1) {
         s->intra_only = 1;
@@ -1078,68 +1080,183 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
         }
     }
  the_end:
-    emms_c();
+    emms_c(); //FIXME remove
 }
 
-static void encode_picture(MpegEncContext *s, int picture_number)
+static void encode_mb(MpegEncContext *s) /* encode the MB at (s->mb_x, s->mb_y) using s->mb_intra and s->mv[0][0] */
 {
-    int mb_x, mb_y, wrap, last_gob, pdif = 0;
+    int wrap;
+    const int mb_x= s->mb_x;
+    const int mb_y= s->mb_y;
     UINT8 *ptr;
-    int i, motion_x, motion_y;
+    const int motion_x= s->mv[0][0][0]; /* MV is taken from the context, the caller sets it before the call */
+    const int motion_y= s->mv[0][0][1];
+    int i;
+
+    /* get the pixels of the current MB from new_picture: four 8x8 blocks from plane 0, then one 8x8 block from each of planes 1 and 2 */
+    wrap = s->linesize;
+    ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16;
+    get_pixels(s->block[0], ptr, wrap);
+    get_pixels(s->block[1], ptr + 8, wrap);
+    get_pixels(s->block[2], ptr + 8 * wrap, wrap);
+    get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap);
+    wrap = s->linesize >> 1; /* planes 1 and 2 are addressed with half the linesize */
+    ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8;
+    get_pixels(s->block[4], ptr, wrap);
+
+    wrap = s->linesize >> 1;
+    ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8;
+    get_pixels(s->block[5], ptr, wrap);
+
+    /* subtract previous frame if non intra (prediction is read from last_picture at the MV offset) */
+    if (!s->mb_intra) {
+        int dxy, offset, mx, my;
+
+        dxy = ((motion_y & 1) << 1) | (motion_x & 1); /* bit 0: x is odd, bit 1: y is odd (presumably the half-pel flags) */
+        ptr = s->last_picture[0] + 
+            ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + 
+            (mb_x * 16 + (motion_x >> 1));
+
+        sub_pixels_2(s->block[0], ptr, s->linesize, dxy);
+        sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy);
+        sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy);
+        sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy);
+
+        if (s->out_format == FMT_H263) {
+            /* special rounding for h263: any nonzero low 2 bits of the MV set the dxy flag */
+            dxy = 0;
+            if ((motion_x & 3) != 0)
+                dxy |= 1;
+            if ((motion_y & 3) != 0)
+                dxy |= 2;
+            mx = motion_x >> 2;
+            my = motion_y >> 2;
+        } else {
+            mx = motion_x / 2; /* halve the MV first, then split it into dxy flag and integer offset */
+            my = motion_y / 2;
+            dxy = ((my & 1) << 1) | (mx & 1);
+            mx >>= 1;
+            my >>= 1;
+        }
+        offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx); /* same offset is used for planes 1 and 2 */
+        ptr = s->last_picture[1] + offset;
+        sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy);
+        ptr = s->last_picture[2] + offset;
+        sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy);
+    }
+            
+#if 0 /* disabled debug output (MB type, qscale, variances) */
+            {
+                float adap_parm;
+                
+                adap_parm = ((s->avg_mb_var << 1) + s->mb_var[s->mb_width*mb_y+mb_x] + 1.0) /
+                            ((s->mb_var[s->mb_width*mb_y+mb_x] << 1) + s->avg_mb_var + 1.0);
+            
+                printf("\ntype=%c qscale=%2d adap=%0.2f dquant=%4.2f var=%4d avgvar=%4d", 
+                        (s->mb_type[s->mb_width*mb_y+mb_x] > 0) ? 'I' : 'P', 
+                        s->qscale, adap_parm, s->qscale*adap_parm,
+                        s->mb_var[s->mb_width*mb_y+mb_x], s->avg_mb_var);
+            }
+#endif
+    /* DCT & quantize: set up the DC scale for the codec first, then process all 6 blocks */
+    if (s->h263_msmpeg4) {
+        msmpeg4_dc_scale(s);
+    } else if (s->h263_pred) {
+        h263_dc_scale(s);
+    } else {
+        /* default quantization values */
+        s->y_dc_scale = 8;
+        s->c_dc_scale = 8;
+    }
+    for(i=0;i<6;i++) {
+        s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale); /* return value is kept per block in block_last_index */
+    }
+
+    /* huffman encode: dispatch to the MB writer selected by out_format (and the h263 variant flags) */
+    switch(s->out_format) {
+    case FMT_MPEG1:
+        mpeg1_encode_mb(s, s->block, motion_x, motion_y);
+        break;
+    case FMT_H263:
+        if (s->h263_msmpeg4)
+            msmpeg4_encode_mb(s, s->block, motion_x, motion_y);
+        else if(s->h263_pred)
+            mpeg4_encode_mb(s, s->block, motion_x, motion_y);
+        else
+            h263_encode_mb(s, s->block, motion_x, motion_y);
+        break;
+    case FMT_MJPEG:
+        mjpeg_encode_mb(s, s->block); /* no motion vector is passed for MJPEG */
+        break;
+    }
+}
+
+static void copy_bits(PutBitContext *pb, UINT8 *src, int length)
+{ /* append the first `length` bits stored in src to pb */
+    int bytes= length>>3; /* number of whole bytes to copy */
+    int bits= length&7;   /* leftover bits, taken from the high end of src[bytes] */
+    int i;
+    /* the tail is skipped when length is a multiple of 8: nothing is left to write and src[bytes] lies past the data */
+    for(i=0; i<bytes; i++) put_bits(pb, 8, src[i]);
+    if(bits) put_bits(pb, bits, src[i]>>(8-bits));
+}
+
+static void encode_picture(MpegEncContext *s, int picture_number)
+{
+    int mb_x, mb_y, last_gob, pdif = 0;
+    int i;
     int bits;
+    MpegEncContext best_s;
+    UINT8 bit_buf[4][3000]; //FIXME check that this is ALWAYS large enough for a MB
 
     s->picture_number = picture_number;
 
+    s->block_wrap[0]=
+    s->block_wrap[1]=
+    s->block_wrap[2]=
+    s->block_wrap[3]= s->mb_width*2 + 2;
+    s->block_wrap[4]=
+    s->block_wrap[5]= s->mb_width + 2;
+    
     s->last_mc_mb_var = s->mc_mb_var;
     /* Reset the average MB variance */
     s->avg_mb_var = 0;
     s->mc_mb_var = 0;
     /* Estimate motion for every MB */
-    for(mb_y=0; mb_y < s->mb_height; mb_y++) {
-        for(mb_x=0; mb_x < s->mb_width; mb_x++) {
-            int xy= mb_y * s->mb_width + mb_x;
-            const int mot_stride = s->mb_width*2 + 2;
-            int mot_xy = (mb_y*2 + 1)*mot_stride + mb_x*2 + 1;
-            s->mb_x = mb_x;
-            s->mb_y = mb_y;
-
-            /* compute motion vector and macro block type (intra or non intra) */
-            motion_x = 0;
-            motion_y = 0;
-            if (s->pict_type == P_TYPE) {
-                s->mb_intra = estimate_motion(s, mb_x, mb_y,
-                                              &motion_x,
-                                              &motion_y);
-            } else {
-                s->mb_intra = 1;
+    if(s->pict_type == P_TYPE){
+        for(mb_y=0; mb_y < s->mb_height; mb_y++) {
+            s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
+            s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
+            s->block_index[2]= s->block_wrap[0]*(mb_y*2 + 2) - 1;
+            s->block_index[3]= s->block_wrap[0]*(mb_y*2 + 2);
+            for(mb_x=0; mb_x < s->mb_width; mb_x++) {
+                s->mb_x = mb_x;
+                s->mb_y = mb_y;
+                s->block_index[0]+=2;
+                s->block_index[1]+=2;
+                s->block_index[2]+=2;
+                s->block_index[3]+=2;
+
+                /* compute motion vector & mb_type and store in context */
+                estimate_motion(s, mb_x, mb_y);
+//                s->mb_type[mb_y*s->mb_width + mb_x]=MB_TYPE_INTER;
             }
-            /* Store MB type and MV */
-            s->mb_type[xy] = s->mb_intra;
-            s->mv_table[0][xy] = motion_x;
-            s->mv_table[1][xy] = motion_y;
-
-            s->motion_val[mot_xy  ][0]= motion_x;
-            s->motion_val[mot_xy  ][1]= motion_y;
-            s->motion_val[mot_xy+1][0]= motion_x;
-            s->motion_val[mot_xy+1][1]= motion_y;
-            mot_xy += mot_stride;
-            s->motion_val[mot_xy  ][0]= motion_x;
-            s->motion_val[mot_xy  ][1]= motion_y;
-            s->motion_val[mot_xy+1][0]= motion_x;
-            s->motion_val[mot_xy+1][1]= motion_y;
         }
+        emms_c();
+    }else{
+        /* I-Frame */
+        //FIXME do we need to zero them?
+        memset(s->motion_val[0], 0, sizeof(INT16)*(s->mb_width*2 + 2)*(s->mb_height*2 + 2)*2);
+        memset(s->mv_table[0]  , 0, sizeof(INT16)*s->mb_width*s->mb_height);
+        memset(s->mv_table[1]  , 0, sizeof(INT16)*s->mb_width*s->mb_height);
+        memset(s->mb_type      , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
     }
-    emms_c();
 
     if(s->avg_mb_var < s->mc_mb_var && s->pict_type != B_TYPE){ //FIXME subtract MV bits
-        int i;
         s->pict_type= I_TYPE;
         s->picture_in_gop_number=0;
-        for(i=0; i<s->mb_num; i++){
-            s->mb_type[i] = 1;
-            s->mv_table[0][i] = 0;
-            s->mv_table[1][i] = 0;
-        }
+        memset(s->mb_type   , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
+//printf("Scene change detected, encoding as I Frame\n");
     }
 
     /* find best f_code for ME which do unlimited searches */
@@ -1152,7 +1269,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         for(i=0; i<8; i++) mv_num[i]=0;
 
         for(i=0; i<s->mb_num; i++){
-            if(s->mb_type[i] == 0){
+            if(s->mb_type[i] & (MB_TYPE_INTER|MB_TYPE_INTER4V)){
                 mv_num[ fcode_tab[s->mv_table[0][i] + MAX_MV] ]++;
                 mv_num[ fcode_tab[s->mv_table[1][i] + MAX_MV] ]++;
 //printf("%d %d %d\n", s->mv_table[0][i], fcode_tab[s->mv_table[0][i] + MAX_MV], i);
@@ -1181,16 +1298,20 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         UINT8 * fcode_tab= s->fcode_tab;
 
         for(i=0; i<s->mb_num; i++){
-            if(s->mb_type[i] == 0){
+            if(s->mb_type[i]&MB_TYPE_INTER){
                 if(   fcode_tab[s->mv_table[0][i] + MAX_MV] > f_code
                    || fcode_tab[s->mv_table[0][i] + MAX_MV] == 0
                    || fcode_tab[s->mv_table[1][i] + MAX_MV] > f_code
                    || fcode_tab[s->mv_table[1][i] + MAX_MV] == 0 ){
-                    s->mb_type[i] = 1;
+                    s->mb_type[i] &= ~MB_TYPE_INTER;
+                    s->mb_type[i] |= MB_TYPE_INTRA;
                     s->mv_table[0][i] = 0;
                     s->mv_table[1][i] = 0;
                 }
             }
+            if(s->mb_type[i]&MB_TYPE_INTER4V){
+              //FIXME
+            }
         }
     }
 
@@ -1249,8 +1370,6 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     s->mb_incr = 1;
     s->last_mv[0][0][0] = 0;
     s->last_mv[0][0][1] = 0;
-    s->mv_type = MV_TYPE_16X16;
-    s->mv_dir = MV_DIR_FORWARD;
 
     /* Get the GOB height based on picture height */
     if (s->out_format == FMT_H263 && !s->h263_pred && !s->h263_msmpeg4) {
@@ -1264,12 +1383,6 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         
     s->avg_mb_var = s->avg_mb_var / s->mb_num;        
     
-    s->block_wrap[0]=
-    s->block_wrap[1]=
-    s->block_wrap[2]=
-    s->block_wrap[3]= s->mb_width*2 + 2;
-    s->block_wrap[4]=
-    s->block_wrap[5]= s->mb_width + 2;
     for(mb_y=0; mb_y < s->mb_height; mb_y++) {
         /* Put GOB header based on RTP MTU */
         /* TODO: Put all this stuff in a separate generic function */
@@ -1292,6 +1405,11 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         s->block_index[4]= s->block_wrap[4]*(mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2);
         s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2);
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
+            const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x];
+            PutBitContext pb;
+            int d;
+            int dmin=10000000;
+            int best=0;
 
             s->mb_x = mb_x;
             s->mb_y = mb_y;
@@ -1301,124 +1419,78 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             s->block_index[3]+=2;
             s->block_index[4]++;
             s->block_index[5]++;
-#if 0
-            /* compute motion vector and macro block type (intra or non intra) */
-            motion_x = 0;
-            motion_y = 0;
-            if (s->pict_type == P_TYPE) {
-                s->mb_intra = estimate_motion(s, mb_x, mb_y,
-                                              &motion_x,
-                                              &motion_y);
-            } else {
-                s->mb_intra = 1;
-            }
-#endif
 
-            s->mb_intra = s->mb_type[mb_y * s->mb_width + mb_x];
-            motion_x = s->mv_table[0][mb_y * s->mb_width + mb_x];
-            motion_y = s->mv_table[1][mb_y * s->mb_width + mb_x];
-            
-            /* get the pixels */
-            wrap = s->linesize;
-            ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16;
-            get_pixels(s->block[0], ptr, wrap);
-            get_pixels(s->block[1], ptr + 8, wrap);
-            get_pixels(s->block[2], ptr + 8 * wrap, wrap);
-            get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap);
-            wrap = s->linesize >> 1;
-            ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8;
-            get_pixels(s->block[4], ptr, wrap);
-
-            wrap = s->linesize >> 1;
-            ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8;
-            get_pixels(s->block[5], ptr, wrap);
-
-            /* subtract previous frame if non intra */
-            if (!s->mb_intra) {
-                int dxy, offset, mx, my;
-
-                dxy = ((motion_y & 1) << 1) | (motion_x & 1);
-                ptr = s->last_picture[0] + 
-                    ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + 
-                    (mb_x * 16 + (motion_x >> 1));
-
-                sub_pixels_2(s->block[0], ptr, s->linesize, dxy);
-                sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy);
-                sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy);
-                sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy);
-
-                if (s->out_format == FMT_H263) {
-                    /* special rounding for h263 */
-                    dxy = 0;
-                    if ((motion_x & 3) != 0)
-                        dxy |= 1;
-                    if ((motion_y & 3) != 0)
-                        dxy |= 2;
-                    mx = motion_x >> 2;
-                    my = motion_y >> 2;
-                } else {
-                    mx = motion_x / 2;
-                    my = motion_y / 2;
-                    dxy = ((my & 1) << 1) | (mx & 1);
-                    mx >>= 1;
-                    my >>= 1;
+            s->mv_type = MV_TYPE_16X16;
+            s->mv_dir = MV_DIR_FORWARD;
+            if(mb_type & (mb_type-1)){ // more than 1 MB type possible
+                pb= s->pb;
+                if(mb_type&MB_TYPE_INTER){
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x];
+                    s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x];
+                    init_put_bits(&s->pb, bit_buf[1], 3000, NULL, NULL);
+                    s->block= s->inter_block;
+
+                    encode_mb(s);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        best_s.mv[0][0][0]= s->mv[0][0][0];
+                        best_s.mv[0][0][1]= s->mv[0][0][1];
+                        best_s.mb_intra= 0;
+                        best_s.pb=s->pb;
+                        best_s.block= s->block;
+                        best=1;
+                        for(i=0; i<6; i++)
+                            best_s.block_last_index[i]= s->block_last_index[i];
+                    }
                 }
-                offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx);
-                ptr = s->last_picture[1] + offset;
-                sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy);
-                ptr = s->last_picture[2] + offset;
-                sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy);
-            }
-            emms_c();
-            
-#if 0
-            {
-                float adap_parm;
-                
-                adap_parm = ((s->avg_mb_var << 1) + s->mb_var[s->mb_width*mb_y+mb_x] + 1.0) /
-                            ((s->mb_var[s->mb_width*mb_y+mb_x] << 1) + s->avg_mb_var + 1.0);
-            
-                printf("\ntype=%c qscale=%2d adap=%0.2f dquant=%4.2f var=%4d avgvar=%4d", 
-                        (s->mb_type[s->mb_width*mb_y+mb_x] > 0) ? 'I' : 'P', 
-                        s->qscale, adap_parm, s->qscale*adap_parm,
-                        s->mb_var[s->mb_width*mb_y+mb_x], s->avg_mb_var);
-            }
-#endif
-            /* DCT & quantize */
-            if (s->h263_msmpeg4) {
-                msmpeg4_dc_scale(s);
-            } else if (s->h263_pred) {
-                h263_dc_scale(s);
+                if(mb_type&MB_TYPE_INTRA){
+                    s->mb_intra= 1;
+                    s->mv[0][0][0] = 0;
+                    s->mv[0][0][1] = 0;
+                    init_put_bits(&s->pb, bit_buf[0], 3000, NULL, NULL);
+                    s->block= s->intra_block;
+                   
+                    encode_mb(s);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        best_s.mv[0][0][0]= 0;
+                        best_s.mv[0][0][1]= 0;
+                        best_s.mb_intra= 1;
+                        best_s.pb=s->pb;
+                        best_s.block= s->block;
+                        for(i=0; i<6; i++)
+                            best_s.block_last_index[i]= s->block_last_index[i];
+                        best=0;
+                    }
+                    /* force cleaning of ac/dc if needed ... */
+                    s->mbintra_table[mb_x + mb_y*s->mb_width]=1;
+                }
+                s->mv[0][0][0]= best_s.mv[0][0][0];
+                s->mv[0][0][1]= best_s.mv[0][0][1];
+                s->mb_intra= best_s.mb_intra;
+                for(i=0; i<6; i++)
+                   s->block_last_index[i]= best_s.block_last_index[i];
+                copy_bits(&pb, bit_buf[best], dmin);
+                s->block= best_s.block;
+                s->pb= pb;
             } else {
-                /* default quantization values */
-                s->y_dc_scale = 8;
-                s->c_dc_scale = 8;
-            }
-            for(i=0;i<6;i++) {
-                s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale);
-            }
-
-            /* huffman encode */
-            switch(s->out_format) {
-            case FMT_MPEG1:
-                mpeg1_encode_mb(s, s->block, motion_x, motion_y);
-                break;
-            case FMT_H263:
-                if (s->h263_msmpeg4)
-                    msmpeg4_encode_mb(s, s->block, motion_x, motion_y);
-                else if(s->h263_pred)
-                    mpeg4_encode_mb(s, s->block, motion_x, motion_y);
-                else
-                    h263_encode_mb(s, s->block, motion_x, motion_y);
-                break;
-            case FMT_MJPEG:
-                mjpeg_encode_mb(s, s->block);
-                break;
+                // only one MB-Type possible
+                if(mb_type&MB_TYPE_INTRA){
+                    s->mb_intra= 1;
+                    s->mv[0][0][0] = 0;
+                    s->mv[0][0][1] = 0;
+                }else{
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x];
+                    s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x];
+                }
+                encode_mb(s);
             }
-            
-            /* decompress blocks so that we keep the state of the decoder */
-            s->mv[0][0][0] = motion_x;
-            s->mv[0][0][1] = motion_y;
 
             MPV_decode_mb(s, s->block);
         }
@@ -1437,6 +1509,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             s->first_gob_line = 0;
         }
     }
+    emms_c();
 
     if (s->h263_msmpeg4 && s->pict_type == I_TYPE)
         msmpeg4_encode_ext_header(s);
@@ -1454,7 +1527,6 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         s->ptr_lastgob = pbBufPtr(&s->pb);
         //fprintf(stderr,"\nGOB: %2d size: %d (last)", s->gob_number, pdif);
     }
-
 }
 
 static int dct_quantize_c(MpegEncContext *s, 
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 49c36bec21..03e9eaf550 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -65,7 +65,7 @@ typedef struct MpegEncContext {
     int qmax;         /* max qscale */
     int max_qdiff;    /* max qscale difference between frames */
     int encoding;     /* true if we are encoding (vs decoding) */
-    int hq;           /* set if CODEC_FLAG_HQ is used in AVCodecContext.flags */
+    int flags;        /* AVCodecContext.flags (HQ, 4MV, ...) */
     /* the following fields are managed internally by the encoder */
 
     /* bit output */
@@ -141,8 +141,16 @@ typedef struct MpegEncContext {
     int mb_x, mb_y;
     int mb_incr;
     int mb_intra;
-    INT16 *mb_var;      /* Table for MB variances */
-    char *mb_type;    /* Table for MB type */
+    UINT16 *mb_var;    /* Table for MB variances */
+    UINT8 *mb_type;    /* Table for MB type */
+#define MB_TYPE_INTRA    0x01
+#define MB_TYPE_INTER    0x02
+#define MB_TYPE_INTER4V  0x04
+#define MB_TYPE_SKIPED   0x08
+#define MB_TYPE_DIRECT   0x10
+#define MB_TYPE_FORWARD  0x20
+#define MB_TYPE_BACKWAD  0x40
+#define MB_TYPE_BIDIR    0x80
 
     int block_index[6];
     int block_wrap[6];
@@ -295,7 +303,10 @@ typedef struct MpegEncContext {
     UINT8 *ptr_last_mb_line;
     UINT32 mb_line_avgsize;
     
-    DCTELEM block[6][64] __align8;
+    DCTELEM (*block)[64]; /* points to one of the following blocks */
+    DCTELEM intra_block[6][64] __align8;
+    DCTELEM inter_block[6][64] __align8;
+    DCTELEM inter4v_block[6][64] __align8;
     void (*dct_unquantize)(struct MpegEncContext *s, 
                            DCTELEM *block, int n, int qscale);
 } MpegEncContext;
@@ -311,9 +322,8 @@ void MPV_common_init_mmx(MpegEncContext *s);
 
 /* motion_est.c */
 
-int estimate_motion(MpegEncContext *s, 
-                    int mb_x, int mb_y,
-                    int *mx_ptr, int *my_ptr);
+void estimate_motion(MpegEncContext *s, 
+                    int mb_x, int mb_y);
 
 /* mpeg12.c */
 extern INT16 default_intra_matrix[64];