much better ME for b frames (a bit slow though)

fixed MC rounding for b frames; fixed hq mode with b-frames. Originally committed as revision 406 to svn://svn.ffmpeg.org/ffmpeg/trunk
23 years ago · 91029be790
parent 1f0cd30fd9
commit 91029be790
5 changed files with 446 additions and 94 deletions
--- a/libavcodec/common.h
+++ b/libavcodec/common.h
@ -881,6 +881,16 @@ static inline int mid_pred(int a, int b, int c)
    return a + b + c - vmin - vmax;
 }

+/* Limit a to [amin, amax]; the lower bound is checked before the upper one. */
+static inline int clip(int a, int amin, int amax)
+{
+    return (a < amin) ? amin
+         : (a > amax) ? amax
+         : a;
+}
+
 /* memory */
 void *av_mallocz(int size);

--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@ -18,7 +18,7 @@
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
- * ac prediction encoding by Michael Niedermayer <michaelni@gmx.at>
+ * ac prediction encoding & b-frame support by Michael Niedermayer <michaelni@gmx.at>
 */
 #include "common.h"
 #include "dsputil.h"
@ -282,7 +282,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
                s->mv[0][0][1]= 
                s->mv[1][0][0]= 
                s->mv[1][0][1]= 0;
-//                s->mv_dir= MV_DIR_FORWARD; //doesnt matter
+                s->mv_dir= MV_DIR_FORWARD; //doesnt matter
                return;
            }

@ -334,7 +334,8 @@ void mpeg4_encode_mb(MpegEncContext * s,
                s->last_mv[0][0][0]= motion_x;
                s->last_mv[0][0][1]= motion_y;
                break;
-            default: 
+            default:
+                printf("unknown mb type\n");
                return;
            }
            bits= get_bit_count(&s->pb);
@ -959,6 +960,31 @@ static void put_string(PutBitContext * pbc, char *s)
    put_bits(pbc, 8, 0);
 }

+/**
+ * Set up the MPEG-4 time fields of s for the picture picture_number.
+ * Must be called before writing the header.
+ *
+ * For I pictures (where a vol header will be encoded) time_increment_resolution
+ * is recomputed as frame_rate / gcd(frame_rate, FRAME_RATE_BASE), replaced by
+ * 256*128 when it would be >= 256*256, and time_increment_bits is derived
+ * from it.
+ *
+ * s->time is then set to
+ *   picture_number * FRAME_RATE_BASE * time_increment_resolution / frame_rate
+ * (evaluated with a 64 bit intermediate).
+ *
+ * For B pictures only bp_time (last_non_b_time - time) is updated; for all
+ * other picture types last_time_base, time_base, pp_time and last_non_b_time
+ * are advanced.
+ */
+void ff_set_mpeg4_time(MpegEncContext * s, int picture_number){
+    int time_div;
+
+    if(s->pict_type==I_TYPE){ //we will encode a vol header
+        s->time_increment_resolution= s->frame_rate/ff_gcd(s->frame_rate, FRAME_RATE_BASE);
+        if(s->time_increment_resolution>=256*256) s->time_increment_resolution= 256*128;
+
+        s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;
+    }
+
+    s->time= picture_number*(int64_t)FRAME_RATE_BASE*s->time_increment_resolution/s->frame_rate;
+    /* whole multiples of time_increment_resolution contained in s->time */
+    time_div= s->time/s->time_increment_resolution;
+
+    if(s->pict_type==B_TYPE){
+        s->bp_time= s->last_non_b_time - s->time;
+    }else{
+        s->last_time_base= s->time_base;
+        s->time_base= time_div;
+        s->pp_time= s->time - s->last_non_b_time;
+        s->last_non_b_time= s->time;
+    }
+}
+
 static void mpeg4_encode_vol_header(MpegEncContext * s)
 {
    int vo_ver_id=1; //must be 2 if we want GMC or q-pel
@ -983,11 +1009,7 @@ static void mpeg4_encode_vol_header(MpegEncContext * s)
    put_bits(&s->pb, 2, RECT_SHAPE);	/* vol shape= rectangle */
    put_bits(&s->pb, 1, 1);		/* marker bit */
    
-    s->time_increment_resolution= s->frame_rate/ff_gcd(s->frame_rate, FRAME_RATE_BASE);
-    if(s->time_increment_resolution>=256*256) s->time_increment_resolution= 256*128;
-
    put_bits(&s->pb, 16, s->time_increment_resolution);
-    s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;
    if (s->time_increment_bits < 1)
        s->time_increment_bits = 1;
    put_bits(&s->pb, 1, 1);		/* marker bit */
@ -1034,9 +1056,6 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
    
    if(s->pict_type==I_TYPE) mpeg4_encode_vol_header(s);
    
-    s->time= s->picture_number*(int64_t)FRAME_RATE_BASE*s->time_increment_resolution/s->frame_rate;
-    time_div= s->time/s->time_increment_resolution;
-    time_mod= s->time%s->time_increment_resolution;
 //printf("num:%d rate:%d base:%d\n", s->picture_number, s->frame_rate, FRAME_RATE_BASE);
    
    if(get_bit_count(&s->pb)!=0) mpeg4_stuffing(&s->pb);
@ -1044,15 +1063,8 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
    put_bits(&s->pb, 16, 0x1B6);	/* vop header */
    put_bits(&s->pb, 2, s->pict_type - 1);	/* pict type: I = 0 , P = 1 */

-    if(s->pict_type==B_TYPE){
-        s->bp_time= s->last_non_b_time - s->time;
-    }else{
-        s->last_time_base= s->time_base;
-        s->time_base= time_div;
-        s->pp_time= s->time - s->last_non_b_time;
-        s->last_non_b_time= s->time;
-    }
-
+    time_div= s->time/s->time_increment_resolution;
+    time_mod= s->time%s->time_increment_resolution;
    time_incr= time_div - s->last_time_base;
    while(time_incr--)
        put_bits(&s->pb, 1, 1);
@ -1770,6 +1782,7 @@ int h263_decode_mb(MpegEncContext *s,
            s->last_mv[0][0][1]= 
            s->last_mv[1][0][0]= 
            s->last_mv[1][0][1]= 0;
+//            printf("\n");
        }

        /* if we skipped it in the future P Frame than skip it now too */
@ -1789,6 +1802,7 @@ int h263_decode_mb(MpegEncContext *s,
 //FIXME is this correct?
 /*            s->last_mv[0][0][0]=
            s->last_mv[0][0][1]=0;*/
+//            printf("S");
            return 0;
        }

@ -1837,6 +1851,7 @@ int h263_decode_mb(MpegEncContext *s,
            s->mv[0][0][1] = 
            s->mv[1][0][0] = 
            s->mv[1][0][1] = 1000;*/
+//            printf("D");
            break;
        case 1: 
            s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
@ -1849,6 +1864,7 @@ int h263_decode_mb(MpegEncContext *s,
            my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
            s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
            s->last_mv[1][0][1]= s->mv[1][0][1] = my;
+//            printf("I");
            break;
        case 2: 
            s->mv_dir = MV_DIR_BACKWARD;
@ -1856,6 +1872,7 @@ int h263_decode_mb(MpegEncContext *s,
            my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
            s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
            s->last_mv[1][0][1]= s->mv[1][0][1] = my;
+//            printf("B");
            break;
        case 3:
            s->mv_dir = MV_DIR_FORWARD;
@ -1863,6 +1880,7 @@ int h263_decode_mb(MpegEncContext *s,
            my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code);
            s->last_mv[0][0][0]= s->mv[0][0][0] = mx;
            s->last_mv[0][0][1]= s->mv[0][0][1] = my;
+//            printf("F");
            break;
        default: return -1;
        }
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@ -29,7 +29,7 @@
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 #define INTER_BIAS	257

-static void halfpel_motion_search(MpegEncContext * s,
+static int halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
 				  int xmin, int ymin, int xmax, int ymax,
                                  int pred_x, int pred_y, uint8_t *ref_picture);
@ -673,7 +673,7 @@ static int epzs_motion_search4(MpegEncContext * s, int block,
    
 /* The idea would be to make half pel ME after Inter/Intra decision to 
   save time. */
-static inline void halfpel_motion_search(MpegEncContext * s,
+static inline int halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
 				  int xmin, int ymin, int xmax, int ymax,
                                  int pred_x, int pred_y, uint8_t *ref_picture)
@ -702,7 +702,7 @@ static inline void halfpel_motion_search(MpegEncContext * s,
        if(dmin < Z_THRESHOLD && mx==0 && my==0){
            *mx_ptr = 0;
            *my_ptr = 0;
-            return;
+            return dmin;
        }
        
        pen_x= pred_x + mx;
@ -727,6 +727,7 @@ static inline void halfpel_motion_search(MpegEncContext * s,

    *mx_ptr = mx;
    *my_ptr = my;
+    return dminh;
 }

 static inline void halfpel_motion_search4(MpegEncContext * s,
@ -1044,17 +1045,15 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
    set_p_mv_tables(s, mx, my);
 }

-void ff_estimate_motion_b(MpegEncContext * s,
+int ff_estimate_motion_b(MpegEncContext * s,
                       int mb_x, int mb_y, int16_t (*mv_table)[2], uint8_t *ref_picture, int f_code)
 {
-    UINT8 *pix, *ppix;
-    int sum, varc, vard, mx, my, range, dmin, xx, yy;
+    int mx, my, range, dmin;
    int xmin, ymin, xmax, ymax;
    int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
    int pred_x=0, pred_y=0;
    int P[6][2];
    const int shift= 1+s->quarter_sample;
-    int mb_type=0;
    const int mot_stride = s->mb_width + 2;
    const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
    
@ -1124,18 +1123,210 @@ void ff_estimate_motion_b(MpegEncContext * s,
    /* At this point (mx,my) are full-pell and the absolute displacement */
 //    ppix = ref_picture + (my * s->linesize) + mx;
    
-    halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y, ref_picture);
+    dmin= halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y, ref_picture);

 //    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
    mv_table[mot_xy][0]= mx;
    mv_table[mot_xy][1]= my;
+    return dmin;
 }


-int ff_decide_type(MpegEncContext * s,
-                int mb_x, int mb_y)
+/*
+ * Score one forward/backward vector pair for the macroblock at (mb_x, mb_y).
+ *
+ * The forward block is read from s->last_picture[0] and written to
+ * s->me_scratchpad, the backward block is read from s->next_picture[0] and
+ * averaged onto it, and the result is compared against s->new_picture[0]
+ * with pix_abs16x16().
+ * The low bit of every vector component selects the put/avg table entry, the
+ * remaining bits are the whole pixel offset.
+ *
+ * Returns that difference plus the mv_penalty of (motion - pred) for all four
+ * vector components, weighted by s->qscale.
+ * The source position is not clipped here.
+ */
+static inline int check_bidir_mv(MpegEncContext * s,
+                   int mb_x, int mb_y,
+                   int motion_fx, int motion_fy,
+                   int motion_bx, int motion_by,
+                   int pred_fx, int pred_fy,
+                   int pred_bx, int pred_by)
 {
+    //FIXME optimize?
+    UINT16 *penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    uint8_t *pred = s->me_scratchpad;
+    uint8_t *ref;
+    int sel;
+    int x, y;
+    int score;
+
+    /* forward block: plain copy into the scratchpad (two calls, column offsets 0 and 8) */
+    sel = ((motion_fy & 1) << 1) | (motion_fx & 1);
+    x = mb_x * 16 + (motion_fx >> 1);
+    y = mb_y * 16 + (motion_fy >> 1);
+    ref = s->last_picture[0] + (y * s->linesize) + x;
+    put_pixels_tab[sel](pred    , ref    , s->linesize, 16);
+    put_pixels_tab[sel](pred + 8, ref + 8, s->linesize, 16);
+
+    /* backward block: averaged onto what the forward pass left there */
+    sel = ((motion_by & 1) << 1) | (motion_bx & 1);
+    x = mb_x * 16 + (motion_bx >> 1);
+    y = mb_y * 16 + (motion_by >> 1);
+    ref = s->next_picture[0] + (y * s->linesize) + x;
+    avg_pixels_tab[sel](pred    , ref    , s->linesize, 16);
+    avg_pixels_tab[sel](pred + 8, ref + 8, s->linesize, 16);
+
+    /* vector cost of both directions, then the prediction error */
+    score  = (penalty[motion_fx-pred_fx] + penalty[motion_fy-pred_fy])*s->qscale;
+    score += (penalty[motion_bx-pred_bx] + penalty[motion_by-pred_by])*s->qscale;
+    score += pix_abs16x16(s->new_picture[0] + mb_x*16 + mb_y*16*s->linesize, pred, s->linesize);
+    return score;
+}

+/*
+ * Bidirectional candidate for the macroblock at (mb_x, mb_y).
+ * Intended to refine the bidir vectors in hq mode; at present it only copies
+ * the forward and backward vectors into the bidir tables and returns their
+ * check_bidir_mv() score (used in both lq & hq mode). The vectors of the
+ * table entry to the left (index - 1) serve as predictors.
+ */
+static inline int bidir_refine(MpegEncContext * s,
+                                  int mb_x, int mb_y)
+{
+    const int stride = s->mb_width + 2;
+    const int cur  = (mb_y + 1)*stride + mb_x + 1;
+    const int left = cur - 1;
+
+    /* start from the unidirectional results */
+    s->b_bidir_forw_mv_table[cur][0]= s->b_forw_mv_table[cur][0];
+    s->b_bidir_forw_mv_table[cur][1]= s->b_forw_mv_table[cur][1];
+    s->b_bidir_back_mv_table[cur][0]= s->b_back_mv_table[cur][0];
+    s->b_bidir_back_mv_table[cur][1]= s->b_back_mv_table[cur][1];
+
+    //FIXME do refinement and add flag
+
+    return check_bidir_mv(s, mb_x, mb_y,
+                          s->b_bidir_forw_mv_table[cur][0], s->b_bidir_forw_mv_table[cur][1],
+                          s->b_bidir_back_mv_table[cur][0], s->b_bidir_back_mv_table[cur][1],
+                          s->b_bidir_forw_mv_table[left][0], s->b_bidir_forw_mv_table[left][1],
+                          s->b_bidir_back_mv_table[left][0], s->b_bidir_back_mv_table[left][1]);
+}
+
+/*
+ * Search the best direct mode delta vector for the macroblock at (mb_x, mb_y).
+ *
+ * Steps:
+ *  1. The vector stored in s->p_mv_table for this position is scaled by
+ *     time_pb/time_pp (forward) and -time_bp/time_pp (backward, zero delta
+ *     case) and that zero delta candidate is scored with check_bidir_mv().
+ *  2. The bidirectional prediction for the 3x3 macroblock neighbourhood is
+ *     rendered into s->me_scratchpad (forward block from s->last_picture[0],
+ *     backward block from s->next_picture[0] averaged onto it).
+ *  3. epzs_motion_search() is run on that scratchpad with a -16..15 range,
+ *     using predictors taken from s->b_direct_mv_table.
+ *  4. The result is doubled and its 8 neighbours are tested with
+ *     check_bidir_mv().
+ *
+ * Writes the delta to s->b_direct_mv_table and the resulting vectors to
+ * s->b_direct_forw_mv_table / s->b_direct_back_mv_table.
+ * Returns the score of the selected candidate.
+ */
+static inline int direct_search(MpegEncContext * s,
+                                int mb_x, int mb_y)
+{
+    int P[6][2];
+    const int mot_stride = s->mb_width + 2;
+    const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
+    int dmin, dmin2;
+    int motion_fx, motion_fy, motion_bx, motion_by, motion_bx0, motion_by0;
+    const int motion_px= s->p_mv_table[mot_xy][0];
+    const int motion_py= s->p_mv_table[mot_xy][1];
+    const int time_pp= s->pp_time;
+    const int time_bp= s->bp_time;
+    const int time_pb= time_pp - time_bp;
+    int bx, by;
+    int mx, my, mx2, my2;
+    /* NOTE(review): the scratchpad base is biased by (mb_x + 1, mb_y + 1)
+       macroblocks so that epzs_motion_search() can address it like a picture;
+       whether + 1 (rather than - 1) selects the centre block depends on how
+       epzs_motion_search() indexes ref_picture -- confirm */
+    uint8_t *ref_picture= s->me_scratchpad - (mb_x + 1 + (mb_y + 1)*s->linesize)*16;
+    int16_t (*mv_table)[2]= s->b_direct_mv_table;
+
+    /* thanks to iso-mpeg the rounding is different for the zero vector, so we need to handle that ... */
+    motion_fx= (motion_px*time_pb)/time_pp;
+    motion_fy= (motion_py*time_pb)/time_pp;
+    motion_bx0= (-motion_px*time_bp)/time_pp;
+    motion_by0= (-motion_py*time_bp)/time_pp;
+    dmin2= check_bidir_mv(s, mb_x, mb_y,
+                          motion_fx, motion_fy,
+                          motion_bx0, motion_by0,
+                          motion_fx, motion_fy,
+                          motion_bx0, motion_by0) - s->qscale;
+
+    /* backward vector used for every non zero delta */
+    motion_bx= motion_fx - motion_px;
+    motion_by= motion_fy - motion_py;
+    for(by=-1; by<2; by++){
+        for(bx=-1; bx<2; bx++){
+            uint8_t *dest_y = s->me_scratchpad + (by+1)*s->linesize*16 + (bx+1)*16;
+            uint8_t *ptr;
+            int dxy;
+            int src_x, src_y;
+            const int width= s->width;
+            const int height= s->height;
+
+            dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
+            src_x = (mb_x + bx) * 16 + (motion_fx >> 1);
+            src_y = (mb_y + by) * 16 + (motion_fy >> 1);
+            src_x = clip(src_x, -16, width);
+            if (src_x == width) dxy &= ~1;
+            src_y = clip(src_y, -16, height);
+            if (src_y == height) dxy &= ~2;
+
+            ptr = s->last_picture[0] + (src_y * s->linesize) + src_x;
+            put_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
+            put_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+
+            dxy = ((motion_by & 1) << 1) | (motion_bx & 1);
+            src_x = (mb_x + bx) * 16 + (motion_bx >> 1);
+            src_y = (mb_y + by) * 16 + (motion_by >> 1);
+            src_x = clip(src_x, -16, width);
+            if (src_x == width) dxy &= ~1;
+            src_y = clip(src_y, -16, height);
+            if (src_y == height) dxy &= ~2;
+
+            /* the backward block has to come from the next picture at the
+               position just computed; without reloading ptr the forward
+               block would be averaged with itself */
+            ptr = s->next_picture[0] + (src_y * s->linesize) + src_x;
+            avg_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
+            avg_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+        }
+    }
+
+    P[0][0] = mv_table[mot_xy    ][0];
+    P[0][1] = mv_table[mot_xy    ][1];
+    P[1][0] = mv_table[mot_xy - 1][0];
+    P[1][1] = mv_table[mot_xy - 1][1];
+
+    /* special case for first line */
+    if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
+        P[4][0] = P[1][0];
+        P[4][1] = P[1][1];
+    } else {
+        P[2][0] = mv_table[mot_xy - mot_stride             ][0];
+        P[2][1] = mv_table[mot_xy - mot_stride             ][1];
+        P[3][0] = mv_table[mot_xy - mot_stride + 1         ][0];
+        P[3][1] = mv_table[mot_xy - mot_stride + 1         ][1];
+
+        P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
+        P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
+    }
+    dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, -16, -16, 15, 15, ref_picture);
+    if(mx==0 && my==0) dmin=99999999; // not representable, due to rounding stuff
+    if(dmin2<dmin){
+        /* the separately scored zero delta candidate wins */
+        dmin= dmin2;
+        mx=0;
+        my=0;
+    }
+#if 1
+    /* double the search result and test its 8 neighbours */
+    mx2= mx= mx*2;
+    my2= my= my*2;
+    for(by=-1; by<2; by++){
+        if(my2+by < -32) continue;
+        for(bx=-1; bx<2; bx++){
+            if(bx==0 && by==0) continue;
+            if(mx2+bx < -32) continue;
+            dmin2= check_bidir_mv(s, mb_x, mb_y,
+                          mx2+bx+motion_fx, my2+by+motion_fy,
+                          mx2+bx+motion_bx, my2+by+motion_by,
+                          mx2+bx+motion_fx, my2+by+motion_fy,
+                          motion_bx, motion_by) - s->qscale;
+
+            if(dmin2<dmin){
+                dmin=dmin2;
+                mx= mx2 + bx;
+                my= my2 + by;
+            }
+        }
+    }
+#else
+    mx*=2; my*=2;
+#endif
+    if(mx==0 && my==0){
+        /* zero delta uses the differently rounded backward vector */
+        motion_bx= motion_bx0;
+        motion_by= motion_by0;
+    }
+
+    s->b_direct_mv_table[mot_xy][0]= mx;
+    s->b_direct_mv_table[mot_xy][1]= my;
+    s->b_direct_forw_mv_table[mot_xy][0]= motion_fx + mx;
+    s->b_direct_forw_mv_table[mot_xy][1]= motion_fy + my;
+    s->b_direct_back_mv_table[mot_xy][0]= motion_bx + mx;
+    s->b_direct_back_mv_table[mot_xy][1]= motion_by + my;
+    return dmin;
 }

 void ff_estimate_b_frame_motion(MpegEncContext * s,
@ -1143,16 +1334,41 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
 {
    const int mot_stride = s->mb_width + 2;
    const int xy = (mb_y + 1)*mot_stride + mb_x + 1;
+    const int quant= s->qscale;
+    int fmin, bmin, dmin, fbmin;
+    int type=0;
+    int motion_fx, motion_fy, motion_bx, motion_by;
+    
+    dmin= direct_search(s, mb_x, mb_y);

-    ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, s->last_picture[0], s->f_code);
-    ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, s->next_picture[0], s->b_code);
+    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, s->last_picture[0], s->f_code);
+    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, s->next_picture[0], s->b_code) - quant;
 //printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
-    s->b_bidir_forw_mv_table[xy][0]= s->b_forw_mv_table[xy][0];
-    s->b_bidir_forw_mv_table[xy][1]= s->b_forw_mv_table[xy][1];
-    s->b_bidir_back_mv_table[xy][0]= s->b_back_mv_table[xy][0];
-    s->b_bidir_back_mv_table[xy][1]= s->b_back_mv_table[xy][1];
-    
-    s->mb_type[mb_y*s->mb_width + mb_x]= MB_TYPE_FORWARD; //FIXME
+
+    fbmin= bidir_refine(s, mb_x, mb_y);
+
+    if(s->flags&CODEC_FLAG_HQ){
+        type= MB_TYPE_FORWARD | MB_TYPE_BACKWARD | MB_TYPE_BIDIR | MB_TYPE_DIRECT;
+    }else{
+        int score= dmin;
+        type=MB_TYPE_DIRECT;
+        
+        if(fmin<score){
+            score=fmin;
+            type= MB_TYPE_FORWARD; 
+        }
+        if(bmin<score){
+            score=bmin;
+            type= MB_TYPE_BACKWARD; 
+        }
+        if(fbmin<score){
+            score=fbmin;
+            type= MB_TYPE_BIDIR;
+        }
+        s->mc_mb_var += score;
+    }
+
+    s->mb_type[mb_y*s->mb_width + mb_x]= type;
 }

 /* find best f_code for ME which do unlimited searches */
@ -1184,8 +1400,12 @@ int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
        }

        for(i=MAX_FCODE; i>1; i--){
+            int threshold;
            loose+= mv_num[i];
-            if(loose > s->mb_num/20) break; //FIXME this is pretty ineffective
+
+            if(s->pict_type==B_TYPE) threshold= 0;
+            else                     threshold= s->mb_num/20; //FIXME 
+            if(loose > threshold) break;
        }
 //    printf("fcode: %d type: %d\n", i, s->pict_type);
        return i;
@ -1275,11 +1495,12 @@ void ff_fix_long_b_mvs(MpegEncContext * s, int16_t (*mv_table)[2], int f_code, i
                   || fcode_tab[mv_table[xy][0] + MAX_MV] == 0
                   || fcode_tab[mv_table[xy][1] + MAX_MV] > f_code
                   || fcode_tab[mv_table[xy][1] + MAX_MV] == 0 ){
-                    s->mb_type[i] &= ~type;
-                    if(s->mb_type[i]==0) s->mb_type[i]= MB_TYPE_FORWARD; //FIXME 
-                    mv_table[xy][0] = 0;
-                    mv_table[xy][1] = 0;
-                    //this is certainly bad FIXME
+                    if(s->mb_type[i]&(~type)) s->mb_type[i] &= ~type;
+                    else{
+                        mv_table[xy][0] = 0;
+                        mv_table[xy][1] = 0;
+                        //this is certainly bad FIXME            
+                    }
                }
            }
            xy++;
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@ -225,6 +225,12 @@ int MPV_common_init(MpegEncContext *s)
            goto fail;
        }

+        s->me_scratchpad = av_mallocz( s->linesize*16*3*sizeof(uint8_t));
+        if (s->me_scratchpad == NULL) {
+            perror("malloc");
+            goto fail;
+        }
+
        if(s->max_b_frames){
            for(j=0; j<REORDER_BUFFER_SIZE; j++){
                int i;
@ -297,7 +303,7 @@ int MPV_common_init(MpegEncContext *s)
    if (!s->mbskip_table)
        goto fail;
    
-    s->block= s->intra_block;
+    s->block= s->blocks[0];

    s->context_initialized = 1;
    return 0;
@ -333,6 +339,7 @@ void MPV_common_end(MpegEncContext *s)
    CHECK_FREE(s->ac_val[0]);
    CHECK_FREE(s->coded_block);
    CHECK_FREE(s->mbintra_table);
+    CHECK_FREE(s->me_scratchpad);

    CHECK_FREE(s->mbskip_table);
    for(i=0;i<3;i++) {
@ -761,16 +768,6 @@ int MPV_encode_picture(AVCodecContext *avctx,
    return pbBufPtr(&s->pb) - s->pb.buf;
 }

-static inline int clip(int a, int amin, int amax)
-{
-    if (a < amin)
-        return amin;
-    else if (a > amax)
-        return amax;
-    else
-        return a;
-}
-
 static inline void gmc1_motion(MpegEncContext *s,
                               UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
                               int dest_offset,
@ -1225,7 +1222,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
        if (!s->mb_intra) {
            /* motion handling */
            if((s->flags&CODEC_FLAG_HQ) || (!s->encoding)){
-                if (!s->no_rounding){
+                if ((!s->no_rounding) || s->pict_type==B_TYPE){                
                    op_pix = put_pixels_tab;
                    op_qpix= qpel_mc_rnd_tab;
                }else{
@ -1235,7 +1232,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])

                if (s->mv_dir & MV_DIR_FORWARD) {
                    MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix);
-                    if (!s->no_rounding) 
+                    if ((!s->no_rounding) || s->pict_type==B_TYPE)
                        op_pix = avg_pixels_tab;
                    else
                        op_pix = avg_no_rnd_pixels_tab;
@ -1312,7 +1309,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
        dest_cb = s->current_picture[1] + (mb_y * 8  * (s->linesize >> 1)) + mb_x * 8;
        dest_cr = s->current_picture[2] + (mb_y * 8  * (s->linesize >> 1)) + mb_x * 8;

-        if (!s->no_rounding){
+        if ((!s->no_rounding) || s->pict_type==B_TYPE){
            op_pix = put_pixels_tab;
            op_qpix= qpel_mc_rnd_tab;
        }else{
@ -1322,7 +1319,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)

        if (s->mv_dir & MV_DIR_FORWARD) {
            MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix);
-            if (!s->no_rounding) 
+           if ((!s->no_rounding) || s->pict_type==B_TYPE)
                op_pix = avg_pixels_tab;
            else
                op_pix = avg_no_rnd_pixels_tab;
@ -1429,6 +1426,8 @@ static void copy_context_before_encode(MpegEncContext *d, MpegEncContext *s, int
    d->skip_count= s->skip_count;
    d->misc_bits= s->misc_bits;
    d->last_bits= s->last_bits;
+
+    d->mb_skiped= s->mb_skiped;
 }

 static void copy_context_after_encode(MpegEncContext *d, MpegEncContext *s, int type){
@ -1453,6 +1452,7 @@ static void copy_context_after_encode(MpegEncContext *d, MpegEncContext *s, int
    d->last_bits= s->last_bits;

    d->mb_intra= s->mb_intra;
+    d->mb_skiped= s->mb_skiped;
    d->mv_type= s->mv_type;
    d->mv_dir= s->mv_dir;
    d->pb= s->pb;
@ -1468,7 +1468,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
    int i;
    int bits;
    MpegEncContext best_s, backup_s;
-    UINT8 bit_buf[4][3000]; //FIXME check that this is ALLWAYS large enogh for a MB
+    UINT8 bit_buf[7][3000]; //FIXME check that this is ALLWAYS large enogh for a MB

    s->picture_number = picture_number;

@ -1483,7 +1483,11 @@ static void encode_picture(MpegEncContext *s, int picture_number)
    /* Reset the average MB variance */
    s->avg_mb_var = 0;
    s->mc_mb_var = 0;
-    
+
+    /* we need to initialize some time vars before we can encode b-frames */
+    if (s->h263_pred && !s->h263_msmpeg4)
+        ff_set_mpeg4_time(s, s->picture_number); 
+
    /* Estimate motion for every MB */
    if(s->pict_type != I_TYPE){
 //        int16_t (*tmp)[2]= s->p_mv_table;
@ -1535,9 +1539,11 @@ static void encode_picture(MpegEncContext *s, int picture_number)
    if(s->pict_type==B_TYPE){
        s->f_code= ff_get_best_fcode(s, s->b_forw_mv_table, MB_TYPE_FORWARD);
        s->b_code= ff_get_best_fcode(s, s->b_back_mv_table, MB_TYPE_BACKWARD);
-        //FIXME if BIDIR != for&back
-        ff_fix_long_b_mvs(s, s->b_forw_mv_table, s->f_code, MB_TYPE_FORWARD |MB_TYPE_BIDIR);
-        ff_fix_long_b_mvs(s, s->b_back_mv_table, s->b_code, MB_TYPE_BACKWARD|MB_TYPE_BIDIR);
+
+        ff_fix_long_b_mvs(s, s->b_forw_mv_table, s->f_code, MB_TYPE_FORWARD);
+        ff_fix_long_b_mvs(s, s->b_back_mv_table, s->b_code, MB_TYPE_BACKWARD);
+        ff_fix_long_b_mvs(s, s->b_bidir_forw_mv_table, s->f_code, MB_TYPE_BIDIR);
+        ff_fix_long_b_mvs(s, s->b_bidir_back_mv_table, s->b_code, MB_TYPE_BIDIR);
    }
    
 //printf("f_code %d ///\n", s->f_code);
@ -1632,7 +1638,8 @@ static void encode_picture(MpegEncContext *s, int picture_number)
        s->block_index[4]= s->block_wrap[4]*(mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2);
        s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2);
        for(mb_x=0; mb_x < s->mb_width; mb_x++) {
-            /*const */int mb_type= s->mb_type[mb_y * s->mb_width + mb_x];
+            const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x];
+            const int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
            PutBitContext pb;
            int d;
            int dmin=10000000;
@ -1647,19 +1654,19 @@ static void encode_picture(MpegEncContext *s, int picture_number)
            s->block_index[4]++;
            s->block_index[5]++;
            if(mb_type & (mb_type-1)){ // more than 1 MB type possible
+                int next_block=0;
                pb= s->pb;
-                s->mv_dir = MV_DIR_FORWARD;

                copy_context_before_encode(&backup_s, s, -1);

                if(mb_type&MB_TYPE_INTER){
-                    int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
+                    s->mv_dir = MV_DIR_FORWARD;
                    s->mv_type = MV_TYPE_16X16;
                    s->mb_intra= 0;
                    s->mv[0][0][0] = s->p_mv_table[xy][0];
                    s->mv[0][0][1] = s->p_mv_table[xy][1];
                    init_put_bits(&s->pb, bit_buf[1], 3000, NULL, NULL);
-                    s->block= s->inter_block;
+                    s->block= s->blocks[next_block];

                    encode_mb(s, s->mv[0][0][0], s->mv[0][0][1]);
                    d= get_bit_count(&s->pb);
@ -1668,10 +1675,12 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                        dmin=d;
                        copy_context_after_encode(&best_s, s, MB_TYPE_INTER);
                        best=1;
+                        next_block^=1;
                    }
                }
                if(mb_type&MB_TYPE_INTER4V){                 
                    copy_context_before_encode(s, &backup_s, MB_TYPE_INTER4V);
+                    s->mv_dir = MV_DIR_FORWARD;
                    s->mv_type = MV_TYPE_8X8;
                    s->mb_intra= 0;
                    for(i=0; i<4; i++){
@ -1679,25 +1688,111 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                        s->mv[0][i][1] = s->motion_val[s->block_index[i]][1];
                    }
                    init_put_bits(&s->pb, bit_buf[2], 3000, NULL, NULL);
-                    s->block= s->inter4v_block;
+                    s->block= s->blocks[next_block];

                    encode_mb(s, 0, 0);
                    d= get_bit_count(&s->pb);
-                    if(d<dmin && 0){
+                    if(d<dmin){
                        flush_put_bits(&s->pb);
                        dmin=d;
                        copy_context_after_encode(&best_s, s, MB_TYPE_INTER4V);
                        best=2;
+                        next_block^=1;
+                    }
+                }
+                if(mb_type&MB_TYPE_FORWARD){
+                    copy_context_before_encode(s, &backup_s, MB_TYPE_FORWARD);
+                    s->mv_dir = MV_DIR_FORWARD;
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->b_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_forw_mv_table[xy][1];
+                    init_put_bits(&s->pb, bit_buf[3], 3000, NULL, NULL);
+                    s->block= s->blocks[next_block];
+
+                    encode_mb(s, s->mv[0][0][0], s->mv[0][0][1]);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        copy_context_after_encode(&best_s, s, MB_TYPE_FORWARD);
+                        best=3;
+                        next_block^=1;
+                    }
+                }
+                if(mb_type&MB_TYPE_BACKWARD){
+                    copy_context_before_encode(s, &backup_s, MB_TYPE_BACKWARD);
+                    s->mv_dir = MV_DIR_BACKWARD;
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 0;
+                    s->mv[1][0][0] = s->b_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_back_mv_table[xy][1];
+                    init_put_bits(&s->pb, bit_buf[4], 3000, NULL, NULL);
+                    s->block= s->blocks[next_block];
+
+                    encode_mb(s, s->mv[1][0][0], s->mv[1][0][1]);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        copy_context_after_encode(&best_s, s, MB_TYPE_BACKWARD);
+                        best=4;
+                        next_block^=1;
+                    }
+                }
+                if(mb_type&MB_TYPE_BIDIR){
+                    copy_context_before_encode(s, &backup_s, MB_TYPE_BIDIR);
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->b_bidir_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_bidir_forw_mv_table[xy][1];
+                    s->mv[1][0][0] = s->b_bidir_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_bidir_back_mv_table[xy][1];
+                    init_put_bits(&s->pb, bit_buf[5], 3000, NULL, NULL);
+                    s->block= s->blocks[next_block];
+
+                    encode_mb(s, 0, 0);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        copy_context_after_encode(&best_s, s, MB_TYPE_BIDIR);
+                        best=5;
+                        next_block^=1;
+                    }
+                }
+                if(mb_type&MB_TYPE_DIRECT){
+                    copy_context_before_encode(s, &backup_s, MB_TYPE_DIRECT);
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
+                    s->mv_type = MV_TYPE_16X16; //FIXME
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1];
+                    s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1];
+                    init_put_bits(&s->pb, bit_buf[6], 3000, NULL, NULL);
+                    s->block= s->blocks[next_block];
+
+                    encode_mb(s, s->b_direct_mv_table[xy][0], s->b_direct_mv_table[xy][1]);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        copy_context_after_encode(&best_s, s, MB_TYPE_DIRECT);
+                        best=6;
+                        next_block^=1;
                    }
                }
                if(mb_type&MB_TYPE_INTRA){
                    copy_context_before_encode(s, &backup_s, MB_TYPE_INTRA);
+                    s->mv_dir = MV_DIR_FORWARD;
                    s->mv_type = MV_TYPE_16X16;
                    s->mb_intra= 1;
                    s->mv[0][0][0] = 0;
                    s->mv[0][0][1] = 0;
                    init_put_bits(&s->pb, bit_buf[0], 3000, NULL, NULL);
-                    s->block= s->intra_block;
+                    s->block= s->blocks[next_block];
                   
                    encode_mb(s, 0, 0);
                    d= get_bit_count(&s->pb);
@ -1706,6 +1801,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                        dmin=d;
                        copy_context_after_encode(&best_s, s, MB_TYPE_INTRA);
                        best=0;
+                        next_block^=1;
                    }
                    /* force cleaning of ac/dc pred stuff if needed ... */
                    if(s->h263_pred || s->h263_aic)
@ -1718,30 +1814,30 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                int motion_x, motion_y;
                s->mv_type=MV_TYPE_16X16;
                // only one MB-Type possible
-                //FIXME convert to swicth()
-                if(mb_type&MB_TYPE_INTRA){
+                switch(mb_type){
+                case MB_TYPE_INTRA:
                    s->mv_dir = MV_DIR_FORWARD;
                    s->mb_intra= 1;
                    motion_x= s->mv[0][0][0] = 0;
                    motion_y= s->mv[0][0][1] = 0;
-                }else if(mb_type&MB_TYPE_INTER){
-                    int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
+                    break;
+                case MB_TYPE_INTER:
                    s->mv_dir = MV_DIR_FORWARD;
                    s->mb_intra= 0;
                    motion_x= s->mv[0][0][0] = s->p_mv_table[xy][0];
                    motion_y= s->mv[0][0][1] = s->p_mv_table[xy][1];
-                }else if(mb_type&MB_TYPE_DIRECT){
-                    int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
+                    break;
+                case MB_TYPE_DIRECT:
                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
                    s->mb_intra= 0;
-                    motion_x=0;
-                    motion_y=0;
-                    s->mv[0][0][0] = 0;
-                    s->mv[0][0][1] = 0;
-                    s->mv[1][0][0] = 0;
-                    s->mv[1][0][1] = 0;
-                }else if(mb_type&MB_TYPE_BIDIR){
-                    int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
+                    motion_x=s->b_direct_mv_table[xy][0];
+                    motion_y=s->b_direct_mv_table[xy][1];
+                    s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1];
+                    s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1];
+                    break;
+                case MB_TYPE_BIDIR:
                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
                    s->mb_intra= 0;
                    motion_x=0;
@ -1750,25 +1846,31 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                    s->mv[0][0][1] = s->b_bidir_forw_mv_table[xy][1];
                    s->mv[1][0][0] = s->b_bidir_back_mv_table[xy][0];
                    s->mv[1][0][1] = s->b_bidir_back_mv_table[xy][1];
-                }else if(mb_type&MB_TYPE_BACKWARD){
-                    int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
+                    break;
+                case MB_TYPE_BACKWARD:
                    s->mv_dir = MV_DIR_BACKWARD;
                    s->mb_intra= 0;
                    motion_x= s->mv[1][0][0] = s->b_back_mv_table[xy][0];
                    motion_y= s->mv[1][0][1] = s->b_back_mv_table[xy][1];
-                }else if(mb_type&MB_TYPE_FORWARD){
-                    int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
+                    break;
+                case MB_TYPE_FORWARD:
                    s->mv_dir = MV_DIR_FORWARD;
                    s->mb_intra= 0;
                    motion_x= s->mv[0][0][0] = s->b_forw_mv_table[xy][0];
                    motion_y= s->mv[0][0][1] = s->b_forw_mv_table[xy][1];
 //                    printf(" %d %d ", motion_x, motion_y);
-                }else{
+                    break;
+                default:
                    motion_x=motion_y=0; //gcc warning fix
                    printf("illegal MB type\n");
                }
                encode_mb(s, motion_x, motion_y);
            }
+            /* clean the MV table in IPS frames for direct mode in B frames */
+            if(s->mb_intra /* && I,P,S_TYPE */){
+                s->p_mv_table[xy][0]=0;
+                s->p_mv_table[xy][1]=0;
+            }

            MPV_decode_mb(s, s->block);
        }
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@ -141,7 +141,8 @@ typedef struct MpegEncContext {
    INT16 (*b_direct_forw_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */
    INT16 (*b_direct_back_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */
    INT16 (*b_direct_mv_table)[2];     /* MV table (1MV per MB) direct mode b-frame encoding */
-    int me_method;          /* ME algorithm */
+    int me_method;                     /* ME algorithm */
+    uint8_t *me_scratchpad;            /* data area for the me algo, so that the ME doesn't need to malloc/free */
    int mv_dir;
 #define MV_DIR_BACKWARD  1
 #define MV_DIR_FORWARD   2
@ -164,7 +165,8 @@ typedef struct MpegEncContext {
    UINT8 *fcode_tab; /* smallest fcode needed for each MV */

    int has_b_frames;
-    int no_rounding; /* apply no rounding to motion compensation (MPEG4, msmpeg4, ...) */
+    int no_rounding; /* apply no rounding to motion compensation (MPEG4, msmpeg4, ...) 
+                        for b-frames rounding mode is always 0 */

    /* macroblock layer */
    int mb_x, mb_y;
@ -335,9 +337,7 @@ typedef struct MpegEncContext {
    UINT32 mb_line_avgsize;
    
    DCTELEM (*block)[64]; /* points to one of the following blocks */
-    DCTELEM intra_block[6][64] __align8;
-    DCTELEM inter_block[6][64] __align8;
-    DCTELEM inter4v_block[6][64] __align8;
+    DCTELEM blocks[2][6][64] __align8; // for HQ mode we need to keep the best block
    void (*dct_unquantize_mpeg1)(struct MpegEncContext *s, 
                           DCTELEM *block, int n, int qscale);
    void (*dct_unquantize_mpeg2)(struct MpegEncContext *s, 
@ -421,6 +421,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block,
                        int *px, int *py);
 void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, 
                   int dir);
+void ff_set_mpeg4_time(MpegEncContext * s, int picture_number);
 void mpeg4_encode_picture_header(MpegEncContext *s, int picture_number);
 void h263_encode_init(MpegEncContext *s);