H264: change weight/biweight functions to take a height argument.

Neon parts by Mans Rullgard <mans@mansr.com>.
13 years ago · c2d337429c
parent 229d263cc9
commit c2d337429c
10 changed files with 337 additions and 592 deletions
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                       int beta, int8_t *tc0);
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
-                                      int weight, int offset);
+                                   int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
-                                     int weight, int offset);
+                                  int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
-                                     int weight, int offset);
+                                  int log2_den, int weight, int offset);
 void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
 void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
                                    int weight, int offset);
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                        int log2_den, int weightd, int weights,
+                                     int height, int log2_den, int weightd,
-                                        int offset);
+                                     int weights, int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
+                                    int height, int log2_den, int weightd,
-                                       int offset);
+                                    int weights, int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
+                                    int height, int log2_den, int weightd,
-                                       int offset);
+                                    int weights, int offset);
 void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
                                      int log2_den, int weightd, int weights,
                                      int offset);
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
-    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
-    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
-    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
-    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
-    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
-    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
    c->h264_idct_add        = ff_h264_idct_add_neon;
    c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@ -1592,7 +1592,7 @@ endfunc
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
@ -1632,7 +1632,7 @@ endfunc
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
@ -1662,7 +1662,7 @@ endfunc
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r3,  r3,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
@ -1700,16 +1700,17 @@ endfunc
        .endm
        .macro  biweight_func w
-function biweight_h264_pixels_\w\()_neon
+function ff_biweight_h264_pixels_\w\()_neon, export=1
        push            {r4-r6, lr}
-        add             r4,  sp,  #16
+        ldr             r12, [sp, #16]
        add             r4,  sp,  #20
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
-        vdup.16         q9,  r3
+        vdup.16         q9,  r12
-        lsl             r6,  r6,  r3
+        lsl             r6,  r6,  r12
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
 endfunc
        .endm
        .macro  biweight_entry w, h, b=1
 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
 .if \b
        b               biweight_h264_pixels_\w\()_neon
 .endif
 endfunc
        .endm
        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16
        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8
        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4
@ Weighted prediction
        .macro  weight_16 add
-        vdup.8          d0,  r3
+        vdup.8          d0,  r12
-1:      subs            ip,  ip,  #2
+1:      subs            r2,  r2,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
@ -1785,8 +1767,8 @@ endfunc
        .endm
        .macro  weight_8 add
-        vdup.8          d0,  r3
+        vdup.8          d0,  r12
-1:      subs            ip,  ip,  #2
+1:      subs            r2,  r2,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
@ -1806,10 +1788,10 @@ endfunc
        .endm
        .macro  weight_4 add
-        vdup.8          d0,  r3
+        vdup.8          d0,  r12
        vmov            q1,  q8
        vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r2,  r2,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
@ -1842,50 +1824,32 @@ endfunc
        .endm
        .macro  weight_func w
-function weight_h264_pixels_\w\()_neon
+function ff_weight_h264_pixels_\w\()_neon, export=1
        push            {r4, lr}
-        ldr             r4,  [sp, #8]
+        ldr             r12, [sp, #8]
-        cmp             r2,  #1
+        ldr             r4,  [sp, #12]
-        lsl             r4,  r4,  r2
+        cmp             r3,  #1
        lsl             r4,  r4,  r3
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
-        rsb             lr,  r2,  #1
+        rsb             lr,  r3,  #1
        vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
        blt             10f
        weight_\w       vhadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
        weight_\w       vhsub.s16
-20:     rsb             lr,  r2,  #0
+20:     rsb             lr,  r3,  #0
        vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
        blt             10f
        weight_\w       vadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
        weight_\w       vsub.s16
 endfunc
        .endm
        .macro  weight_entry w, h, b=1
 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
 .if \b
        b               weight_h264_pixels_\w\()_neon
 .endif
 endfunc
        .endm
        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16
        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8
        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
 }
 #endif
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
+static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                               int height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
            src_cb= s->edge_emu_buffer;
    }
-    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
    if(emu){
        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
            src_cr= s->edge_emu_buffer;
    }
-    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 }
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
    if(list0){
        Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
+        mc_dir_part(h, ref, n, square, height, delta, 0,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op, pixel_shift, chroma444);
@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
    if(list1){
        Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
+        mc_dir_part(h, ref, n, square, height, delta, 1,
                           dest_y, dest_cb, dest_cr, x_offset, y_offset,
                           qpix_op, chroma_op, pixel_shift, chroma444);
    }
 }
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1, int pixel_shift, int chroma444){
    MpegEncContext * const s = &h->s;
    int chroma_height;
    dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
    if(chroma444){
        chroma_height = height;
        chroma_weight_avg = luma_weight_avg;
        chroma_weight_op = luma_weight_op;
        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
    } else if (CHROMA422) {
        chroma_height = height;
        dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
        dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
    }else{
        chroma_height = height >> 1;
        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
    }
@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];
-        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
+        mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
-        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
+        mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
        if(h->use_weight == 2){
            int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
            int weight1 = 64 - weight0;
-            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
+            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize,
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
+                              height,        5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
-            if (CHROMA422) {
+                              chroma_height, 5, weight0, weight1, 0);
-                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
-                                  tmp_cb + chroma_height * h->mb_uvlinesize,
+                              chroma_height, 5, weight0, weight1, 0);
                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
                                  tmp_cr + chroma_height * h->mb_uvlinesize,
                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
            }
        }else{
-            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
+            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                            h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
                            h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                            h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                            h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                            h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
            if (CHROMA422) {
                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
                                  tmp_cb + chroma_height * h->mb_uvlinesize,
                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                            h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
-                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                                  tmp_cr + chroma_height * h->mb_uvlinesize,
                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                            h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
        }
        }
    }else{
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
+        mc_dir_part(h, ref, n, square, height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put, pixel_shift, chroma444);
-        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
+        luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                       h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
        if(h->use_weight_chroma){
-            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
            if (CHROMA422) {
                chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
-                chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
+            chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
        }
    }
 }
 }
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
    if((h->use_weight==2 && list0 && list1
        && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
       || h->use_weight==1)
-        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                         x_offset, y_offset, qpix_put, chroma_put,
-                         weight_op[0], weight_op[3], weight_avg[0],
+                         weight_op[0], weight_op[1], weight_avg[0],
-                         weight_avg[3], list0, list1, pixel_shift, chroma444);
+                         weight_avg[1], list0, list1, pixel_shift, chroma444);
    else
-        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
                    chroma_avg, list0, list1, pixel_shift, chroma444);
 }
@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
    prefetch_motion(h, 0, pixel_shift, chroma444);
    if(IS_16X16(mb_type)){
-        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                weight_op, weight_avg,
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                pixel_shift, chroma444);
    }else if(IS_16X8(mb_type)){
-        mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                pixel_shift, chroma444);
-        mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
+        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                pixel_shift, chroma444);
    }else if(IS_8X16(mb_type)){
-        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                pixel_shift, chroma444);
-        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
+        mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                pixel_shift, chroma444);
    }else{
@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
            int y_offset= (i&2)<<1;
            if(IS_SUB_8X8(sub_mb_type)){
-                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                    &weight_op[3], &weight_avg[3],
+                    &weight_op[1], &weight_avg[1],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
            }else if(IS_SUB_8X4(sub_mb_type)){
-                mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
-                mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+                mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
            }else if(IS_SUB_4X8(sub_mb_type)){
-                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
-                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+                mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                    pixel_shift, chroma444);
            }else{
@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
-                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+                    mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[6], &weight_avg[6],
+                        &weight_op[2], &weight_avg[2],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                        pixel_shift, chroma444);
                }
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
    else\
        c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
 \
-    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
+    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
-    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
+    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
-    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
+    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
-    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
+    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
-    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
+    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
-    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
+    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
-    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
+    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
-    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
+    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
 \
    c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
    c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@ -31,16 +31,18 @@
 #include "dsputil.h"
 //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
+                                 int log2_denom, int weight, int offset);
 typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
                                   int log2_denom, int weightd, int weights, int offset);
 /**
 * Context for storing H.264 DSP functions
 */
 typedef struct H264DSPContext{
    /* weighted MC */
-    h264_weight_func weight_h264_pixels_tab[10];
+    h264_weight_func weight_h264_pixels_tab[4];
-    h264_biweight_func biweight_h264_pixels_tab[10];
+    h264_biweight_func biweight_h264_pixels_tab[4];
    /* loop filter */
    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@ -29,14 +29,16 @@
 #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
 #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
+#define H264_WEIGHT(W) \
-static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
                                           int log2_denom, int weight, int offset) \
 { \
    int y; \
    pixel *block = (pixel*)_block; \
    stride /= sizeof(pixel); \
    offset <<= (log2_denom + (BIT_DEPTH-8)); \
    if(log2_denom) offset += 1<<(log2_denom-1); \
-    for(y=0; y<H; y++, block += stride){ \
+    for (y = 0; y < height; y++, block += stride) { \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
        op_scale1(15); \
    } \
 } \
-static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
                                             int log2_denom, int weightd, int weights, int offset) \
 { \
    int y; \
    pixel *dst = (pixel*)_dst; \
    pixel *src = (pixel*)_src; \
    stride /= sizeof(pixel); \
    offset <<= (BIT_DEPTH-8); \
    offset = ((offset + 1) | 1) << log2_denom; \
-    for(y=0; y<H; y++, dst += stride, src += stride){ \
+    for (y = 0; y < height; y++, dst += stride, src += stride) { \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
    } \
 }
-H264_WEIGHT(16,16)
+H264_WEIGHT(16)
-H264_WEIGHT(16,8)
+H264_WEIGHT(8)
-H264_WEIGHT(8,16)
+H264_WEIGHT(4)
-H264_WEIGHT(8,8)
+H264_WEIGHT(2)
 H264_WEIGHT(8,4)
 H264_WEIGHT(4,8)
 H264_WEIGHT(4,4)
 H264_WEIGHT(4,2)
 H264_WEIGHT(2,4)
 H264_WEIGHT(2,2)
 #undef op_scale1
 #undef op_scale2
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
 }
 static av_always_inline
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
+void weight_h264_W_altivec(uint8_t *block, int stride, int height,
                           int log2_denom, int weight, int offset, int w)
 {
    int y, aligned;
    vec_u8 vblock;
@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
    voffset = vec_splat(vtemp, 5);
    aligned = !((unsigned long)block & 0xf);
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
        vblock = vec_ld(0, block);
        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
 }
 static av_always_inline
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
+void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
-                               int weightd, int weights, int offset, int w, int h)
+                             int log2_denom, int weightd, int weights, int offset, int w)
 {
    int y, dst_aligned, src_aligned;
    vec_u8 vsrc, vdst;
@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
    dst_aligned = !((unsigned long)dst & 0xf);
    src_aligned = !((unsigned long)src & 0xf);
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
        vdst = vec_ld(0, dst);
        vsrc = vec_ld(0, src);
@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
    }
 }
-#define H264_WEIGHT(W,H) \
+#define H264_WEIGHT(W) \
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
+static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
-    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
+                                                   int log2_denom, int weight, int offset){ \
    weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \
 }\
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
-    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+                                                     int log2_denom, int weightd, int weights, int offset){ \
    biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
 }
-H264_WEIGHT(16,16)
+H264_WEIGHT(16)
-H264_WEIGHT(16, 8)
+H264_WEIGHT( 8)
 H264_WEIGHT( 8,16)
 H264_WEIGHT( 8, 8)
 H264_WEIGHT( 8, 4)
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
-        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
-        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
-        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
-        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
    }
    }
 }
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@ -28,21 +28,20 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 ; biweight pred:
 ;
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
-;                               int log2_denom, int weightd, int weights,
+;                            int height, int log2_denom, int weightd,
-;                               int offset);
+;                            int weights, int offset);
 ; and
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
-;                             int log2_denom, int weight,
+;                          int log2_denom, int weight, int offset);
-;                             int offset);
 ;-----------------------------------------------------------------------------
 %macro WEIGHT_SETUP 0
-    add        r4, r4
+    add        r5, r5
-    inc        r4
+    inc        r5
-    movd       m3, r3d
+    movd       m3, r4d
-    movd       m5, r4d
+    movd       m5, r5d
-    movd       m6, r2d
+    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
 %if mmsize == 16
@ -71,60 +70,41 @@ SECTION .text
    packuswb      m0, m1
 %endmacro
-%macro WEIGHT_FUNC_DBL_MM 1
+INIT_MMX
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0
+cglobal h264_weight_16_mmx2, 6, 6, 0
    WEIGHT_SETUP
    mov        r2, %1
 %if %1 == 16
 .nextrow
    WEIGHT_OP 0,  4
    mova     [r0  ], m0
    WEIGHT_OP 8, 12
    mova     [r0+8], m0
    add        r0, r1
-    dec        r2
+    dec        r2d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
 %endif
 %endmacro
-INIT_MMX
+%macro WEIGHT_FUNC_MM 3
-WEIGHT_FUNC_DBL_MM 16
+cglobal h264_weight_%1_%3, 6, 6, %2
 WEIGHT_FUNC_DBL_MM  8
 %macro WEIGHT_FUNC_MM 4
 cglobal h264_weight_%1x%2_%4, 7, 7, %3
    WEIGHT_SETUP
    mov        r2, %2
 %if %2 == 16
 .nextrow
    WEIGHT_OP 0, mmsize/2
    mova     [r0], m0
    add        r0, r1
-    dec        r2
+    dec        r2d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
 %endif
 %endmacro
 INIT_MMX
-WEIGHT_FUNC_MM  8, 16,  0, mmx2
+WEIGHT_FUNC_MM  8, 0, mmx2
 WEIGHT_FUNC_MM  8,  8,  0, mmx2
 WEIGHT_FUNC_MM  8,  4,  0, mmx2
 INIT_XMM
-WEIGHT_FUNC_MM 16, 16,  8, sse2
+WEIGHT_FUNC_MM 16, 8, sse2
 WEIGHT_FUNC_MM 16,  8,  8, sse2
-%macro WEIGHT_FUNC_HALF_MM 5
+%macro WEIGHT_FUNC_HALF_MM 3
-cglobal h264_weight_%1x%2_%5, 5, 5, %4
+cglobal h264_weight_%1_%3, 6, 6, %2
    WEIGHT_SETUP
-    mov        r2, %2/2
+    sar       r2d, 1
    lea        r3, [r1*2]
 %if %2 == mmsize
 .nextrow
    WEIGHT_OP 0, r1
    movh     [r0], m0
@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
    movh     [r0+r1], m0
 %endif
    add        r0, r3
-    dec        r2
+    dec        r2d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
 %endif
 %endmacro
 INIT_MMX
-WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
 %macro BIWEIGHT_SETUP 0
-    add        r6, 1
+%ifdef ARCH_X86_64
-    or         r6, 1
+%define off_regd r11d
-    add        r3, 1
+%else
-    movd       m3, r4d
+%define off_regd r3d
-    movd       m4, r5d
+%endif
-    movd       m5, r6d
+    mov  off_regd, r7m
-    movd       m6, r3d
+    add  off_regd, 1
    or   off_regd, 1
    add        r4, 1
    movd       m3, r5d
    movd       m4, r6d
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
 %if mmsize == 16
@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
    packuswb   m0, m1
 %endmacro
-%macro BIWEIGHT_FUNC_DBL_MM 1
+INIT_MMX
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
+cglobal h264_biweight_16_mmx2, 7, 7, 0
    BIWEIGHT_SETUP
-    mov        r3, %1
+    movifnidn r3d, r3m
 %if %1 == 16
 .nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
    mova     [r0+8], m0
    add        r0, r2
    add        r1, r2
-    dec        r3
+    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
 %endif
 %endmacro
-INIT_MMX
+%macro BIWEIGHT_FUNC_MM 3
-BIWEIGHT_FUNC_DBL_MM 16
+cglobal h264_biweight_%1_%3, 7, 7, %2
 BIWEIGHT_FUNC_DBL_MM  8
 %macro BIWEIGHT_FUNC_MM 4
 cglobal h264_biweight_%1x%2_%4, 7, 7, %3
    BIWEIGHT_SETUP
-    mov        r3, %2
+    movifnidn r3d, r3m
 %if %2 == 16
 .nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
-    dec        r3
+    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
 %endif
 %endmacro
 INIT_MMX
-BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
+BIWEIGHT_FUNC_MM  8, 0, mmx2
 BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
 BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_MM 16, 16,  8, sse2
+BIWEIGHT_FUNC_MM 16, 8, sse2
 BIWEIGHT_FUNC_MM 16,  8,  8, sse2
-%macro BIWEIGHT_FUNC_HALF_MM 5
+%macro BIWEIGHT_FUNC_HALF_MM 3
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4
+cglobal h264_biweight_%1_%3, 7, 7, %2
    BIWEIGHT_SETUP
-    mov        r3, %2/2
+    movifnidn r3d, r3m
    sar        r3, 1
    lea        r4, [r2*2]
 %if %2 == mmsize
 .nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
 %endif
    add        r0, r4
    add        r1, r4
-    dec        r3
+    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
 %endif
 %endmacro
 INIT_MMX
-BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
 BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
 BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
 BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
 BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
 %macro BIWEIGHT_SSSE3_SETUP 0
-    add        r6, 1
+%ifdef ARCH_X86_64
-    or         r6, 1
+%define off_regd r11d
-    add        r3, 1
+%else
-    movd       m4, r4d
+%define off_regd r3d
-    movd       m0, r5d
+%endif
-    movd       m5, r6d
+    mov  off_regd, r7m
-    movd       m6, r3d
+    add  off_regd, 1
    or   off_regd, 1
    add        r4, 1
    movd       m4, r5d
    movd       m0, r6d
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
    punpcklbw  m4, m0
@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
    packuswb   m0, m2
 %endmacro
-%macro BIWEIGHT_SSSE3_16 1
+INIT_XMM
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
+cglobal h264_biweight_16_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1
+    movifnidn r3d, r3m
 %if %1 == 16
 .nextrow
    movh       m0, [r0]
    movh       m2, [r0+8]
@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
-    dec        r3
+    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
 %endif
 %endmacro
 INIT_XMM
-BIWEIGHT_SSSE3_16 16
+cglobal h264_biweight_8_ssse3, 7, 7, 8
 BIWEIGHT_SSSE3_16  8
 %macro BIWEIGHT_SSSE3_8 1
 cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1/2
+    movifnidn r3d, r3m
    sar        r3, 1
    lea        r4, [r2*2]
 %if %1 == 16
 .nextrow
    movh       m0, [r0]
    movh       m1, [r1]
@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
-    dec        r3
+    dec        r3d
    jnz .nextrow
    REP_RET
 %else
    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
 %endif
 %endmacro
 INIT_XMM
 BIWEIGHT_SSSE3_8 16
 BIWEIGHT_SSSE3_8  8
 BIWEIGHT_SSSE3_8  4
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@ -36,33 +36,26 @@ cextern pw_1
 SECTION .text
 ;-----------------------------------------------------------------------------
-; void h264_weight(uint8_t *dst, int stride, int log2_denom,
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
 ;                  int weight, int offset);
 ;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_32
+%macro WEIGHT_PROLOGUE 0
 DECLARE_REG_TMP 2
 %else
 DECLARE_REG_TMP 10
 %endif
 %macro WEIGHT_PROLOGUE 1
    mov t0, %1
 .prologue
-    PROLOGUE 0,5,8
+    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
    movifnidn r3d, r3m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
 %endmacro
 %macro WEIGHT_SETUP 1
    mova       m0, [pw_1]
-    movd       m2, r2m
+    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
-    shl        r4, 19       ; *8, move to upper half of dword
+    shl        r5, 19       ; *8, move to upper half of dword
-    lea        r4, [r4+r3*2+0x10000]
+    lea        r5, [r5+r4*2+0x10000]
-    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
+    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
 %endmacro
 %macro WEIGHT_FUNC_DBL 1
-cglobal h264_weight_16x16_10_%1
+cglobal h264_weight_16_10_%1
-    WEIGHT_PROLOGUE 16
+    WEIGHT_PROLOGUE
    WEIGHT_SETUP %1
 .nextrow
    WEIGHT_OP %1,  0
@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
    WEIGHT_OP %1, 16
    mova [r0+16], m5
    add       r0, r1
-    dec       t0
+    dec       r2d
    jnz .nextrow
    REP_RET
 cglobal h264_weight_16x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
 %macro WEIGHT_FUNC_MM 1
-cglobal h264_weight_8x16_10_%1
+cglobal h264_weight_8_10_%1
-    WEIGHT_PROLOGUE 16
+    WEIGHT_PROLOGUE
    WEIGHT_SETUP %1
 .nextrow
    WEIGHT_OP  %1, 0
    mova     [r0], m5
    add        r0, r1
-    dec        t0
+    dec        r2d
    jnz .nextrow
    REP_RET
 cglobal h264_weight_8x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
 cglobal h264_weight_8x4_10_%1
    mov t0, 4
    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
 %macro WEIGHT_FUNC_HALF_MM 1
-cglobal h264_weight_4x8_10_%1
+cglobal h264_weight_4_10_%1
-    WEIGHT_PROLOGUE 4
+    WEIGHT_PROLOGUE
    sar         r2d, 1
    WEIGHT_SETUP %1
    lea         r3, [r1*2]
 .nextrow
@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
    movh      [r0], m5
    movhps [r0+r1], m5
    add         r0, r3
-    dec         t0
+    dec         r2d
    jnz .nextrow
    REP_RET
 cglobal h264_weight_4x4_10_%1
    mov t0, 2
    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
 cglobal h264_weight_4x2_10_%1
    mov t0, 1
    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
 %endmacro
 INIT_XMM
@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
 ;-----------------------------------------------------------------------------
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
-;                    int weightd, int weights, int offset);
+;                    int log2_denom, int weightd, int weights, int offset);
 ;-----------------------------------------------------------------------------
 %ifdef ARCH_X86_32
-DECLARE_REG_TMP 2,3
+DECLARE_REG_TMP 3
 %else
-DECLARE_REG_TMP 10,2
+DECLARE_REG_TMP 10
 %endif
-%macro BIWEIGHT_PROLOGUE 1
+%macro BIWEIGHT_PROLOGUE 0
    mov t0, %1
 .prologue
    PROLOGUE 0,7,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
-    movifnidn t1d, r2m
+    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
 %endmacro
 %macro BIWEIGHT_SETUP 1
-    lea        r6, [r6*4+1] ; (offset<<2)+1
+    lea        t0, [t0*4+1] ; (offset<<2)+1
-    or         r6, 1
+    or         t0, 1
-    shl        r5, 16
+    shl        r6, 16
-    or         r4, r5
+    or         r5, r6
-    movd       m4, r4d      ; weightd | weights
+    movd       m4, r5d      ; weightd | weights
-    movd       m5, r6d      ; (offset+1)|1
+    movd       m5, t0d      ; (offset+1)|1
-    movd       m6, r3m      ; log2_denom
+    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m
 %ifnidn %1, sse4
    pxor       m7, m7
 %endif
@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
 %endmacro
 %macro BIWEIGHT_FUNC_DBL 1
-cglobal h264_biweight_16x16_10_%1
+cglobal h264_biweight_16_10_%1
-    BIWEIGHT_PROLOGUE 16
+    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
 .nextrow
    BIWEIGHT  %1,  0
    mova [r0   ], m0
    BIWEIGHT  %1, 16
    mova [r0+16], m0
-    add       r0, t1
+    add       r0, r2
-    add       r1, t1
+    add       r1, r2
-    dec       t0
+    dec       r3d
    jnz .nextrow
    REP_RET
 cglobal h264_biweight_16x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
 BIWEIGHT_FUNC_DBL sse4
 %macro BIWEIGHT_FUNC 1
-cglobal h264_biweight_8x16_10_%1
+cglobal h264_biweight_8_10_%1
-    BIWEIGHT_PROLOGUE 16
+    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
 .nextrow
    BIWEIGHT %1, 0
    mova   [r0], m0
-    add      r0, t1
+    add      r0, r2
-    add      r1, t1
+    add      r1, r2
-    dec      t0
+    dec      r3d
    jnz .nextrow
    REP_RET
 cglobal h264_biweight_8x8_10_%1
    mov t0, 8
    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
 cglobal h264_biweight_8x4_10_%1
    mov t0, 4
    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
 %endmacro
 INIT_XMM
@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
 BIWEIGHT_FUNC sse4
 %macro BIWEIGHT_FUNC_HALF 1
-cglobal h264_biweight_4x8_10_%1
+cglobal h264_biweight_4_10_%1
-    BIWEIGHT_PROLOGUE 4
+    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP %1
-    lea        r4, [t1*2]
+    sar        r3d, 1
    lea        r4, [r2*2]
 .nextrow
-    BIWEIGHT    %1, 0, t1
+    BIWEIGHT    %1, 0, r2
    movh   [r0   ], m0
-    movhps [r0+t1], m0
+    movhps [r0+r2], m0
    add         r0, r4
    add         r1, r4
-    dec         t0
+    dec         r3d
    jnz .nextrow
    REP_RET
 cglobal h264_biweight_4x4_10_%1
    mov t0, 2
    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
 cglobal h264_biweight_4x2_10_%1
    mov t0, 1
    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
 %endmacro
 INIT_XMM
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@ -298,57 +298,47 @@ LF_IFUNC(v,  luma_intra,      10, mmxext)
 /***********************************/
 /* weighted prediction */
-#define H264_WEIGHT(W, H, OPT) \
+#define H264_WEIGHT(W, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
-    int stride, int log2_denom, int weight, int offset);
+    int stride, int height, int log2_denom, int weight, int offset);
-#define H264_BIWEIGHT(W, H, OPT) \
+#define H264_BIWEIGHT(W, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
-    uint8_t *src, int stride, int log2_denom, int weightd, \
+    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
    int weights, int offset);
-#define H264_BIWEIGHT_MMX(W,H) \
+#define H264_BIWEIGHT_MMX(W) \
-H264_WEIGHT  (W, H, mmx2) \
+H264_WEIGHT  (W, mmx2) \
-H264_BIWEIGHT(W, H, mmx2)
+H264_BIWEIGHT(W, mmx2)
-
+
-#define H264_BIWEIGHT_MMX_SSE(W,H) \
+#define H264_BIWEIGHT_MMX_SSE(W) \
-H264_BIWEIGHT_MMX(W, H) \
+H264_BIWEIGHT_MMX(W) \
-H264_WEIGHT      (W, H, sse2) \
+H264_WEIGHT      (W, sse2) \
-H264_BIWEIGHT    (W, H, sse2) \
+H264_BIWEIGHT    (W, sse2) \
-H264_BIWEIGHT    (W, H, ssse3)
+H264_BIWEIGHT    (W, ssse3)
-
+
-H264_BIWEIGHT_MMX_SSE(16, 16)
+H264_BIWEIGHT_MMX_SSE(16)
-H264_BIWEIGHT_MMX_SSE(16,  8)
+H264_BIWEIGHT_MMX_SSE( 8)
-H264_BIWEIGHT_MMX_SSE( 8, 16)
+H264_BIWEIGHT_MMX    ( 4)
-H264_BIWEIGHT_MMX_SSE( 8,  8)
+
-H264_BIWEIGHT_MMX_SSE( 8,  4)
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
-H264_BIWEIGHT_MMX    ( 4,  8)
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
-H264_BIWEIGHT_MMX    ( 4,  4)
+    int stride, int height, int log2_denom, int weight, int offset);
-H264_BIWEIGHT_MMX    ( 4,  2)
+
-
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
-#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
-void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
    int stride, int log2_denom, int weight, int offset);
 #define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
 void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
     int weightd, int weights, int offset);
-#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
-H264_WEIGHT_10  (W, H, DEPTH, sse2) \
+H264_WEIGHT_10  (W, DEPTH, sse2) \
-H264_WEIGHT_10  (W, H, DEPTH, sse4) \
+H264_WEIGHT_10  (W, DEPTH, sse4) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
+H264_BIWEIGHT_10(W, DEPTH, sse2) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse4)
+H264_BIWEIGHT_10(W, DEPTH, sse4)
-
+
-H264_BIWEIGHT_10_SSE(16, 16, 10)
+H264_BIWEIGHT_10_SSE(16, 10)
-H264_BIWEIGHT_10_SSE(16,  8, 10)
+H264_BIWEIGHT_10_SSE( 8, 10)
-H264_BIWEIGHT_10_SSE( 8, 16, 10)
+H264_BIWEIGHT_10_SSE( 4, 10)
 H264_BIWEIGHT_10_SSE( 8,  8, 10)
 H264_BIWEIGHT_10_SSE( 8,  4, 10)
 H264_BIWEIGHT_10_SSE( 4,  8, 10)
 H264_BIWEIGHT_10_SSE( 4,  4, 10)
 H264_BIWEIGHT_10_SSE( 4,  2, 10)
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
-            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+
-            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
-            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
-            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
-                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
+                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
-                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
+                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
 #if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
            }
            if (mm_flags&AV_CPU_FLAG_SSSE3) {
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
            }
            if (mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
 #endif
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
+
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
            }
            if (mm_flags&AV_CPU_FLAG_SSE4) {
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
+
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
            }
 #if HAVE_AVX
            if (mm_flags&AV_CPU_FLAG_AVX) {