diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index c1ca217add..1c331a495d 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0);
 
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
-                                      int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+                                   int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
 
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                        int log2_den, int weightd, int weights,
-                                        int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                     int height, int log2_den, int weightd,
+                                     int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
 
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
     c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
     c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
 
-    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
-    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
-    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
-    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
-    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
-    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
-    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
-    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
 
-    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
-    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
-    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
-    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
-    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
-    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
-    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
-    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
 
     c->h264_idct_add        = ff_h264_idct_add_neon;
     c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 0fa4a6b0a5..3d2c6746ae 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1592,7 +1592,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q2,  q8
         vmov            q3,  q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
         vld1.8          {d20-d21},[r0,:128], r2
         \macd           q2,  d0,  d20
         pld             [r0]
@@ -1632,7 +1632,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
         vld1.8          {d4},[r0,:64], r2
         \macd           q1,  d0,  d4
         pld             [r0]
@@ -1662,7 +1662,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r3,  r3,  #4
         vld1.32         {d4[0]},[r0,:32], r2
         vld1.32         {d4[1]},[r0,:32], r2
         \macd           q1,  d0,  d4
@@ -1700,16 +1700,17 @@ endfunc
         .endm
 
         .macro  biweight_func w
-function biweight_h264_pixels_\w\()_neon
+function ff_biweight_h264_pixels_\w\()_neon, export=1
         push            {r4-r6, lr}
-        add             r4,  sp,  #16
+        ldr             r12, [sp, #16]
+        add             r4,  sp,  #20
         ldm             r4,  {r4-r6}
         lsr             lr,  r4,  #31
         add             r6,  r6,  #1
         eors            lr,  lr,  r5,  lsr #30
         orr             r6,  r6,  #1
-        vdup.16         q9,  r3
-        lsl             r6,  r6,  r3
+        vdup.16         q9,  r12
+        lsl             r6,  r6,  r12
         vmvn            q9,  q9
         vdup.16         q8,  r6
         mov             r6,  r0
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
 endfunc
         .endm
 
-        .macro  biweight_entry w, h, b=1
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               biweight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        biweight_entry  16, 8
-        biweight_entry  16, 16, b=0
         biweight_func   16
-
-        biweight_entry  8,  16
-        biweight_entry  8,  4
-        biweight_entry  8,  8,  b=0
         biweight_func   8
-
-        biweight_entry  4,  8
-        biweight_entry  4,  2
-        biweight_entry  4,  4,  b=0
         biweight_func   4
 
 @ Weighted prediction
 
         .macro  weight_16 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
+        vdup.8          d0,  r12
+1:      subs            r2,  r2,  #2
         vld1.8          {d20-d21},[r0,:128], r1
         vmull.u8        q2,  d0,  d20
         pld             [r0]
@@ -1785,8 +1767,8 @@ endfunc
         .endm
 
         .macro  weight_8 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
+        vdup.8          d0,  r12
+1:      subs            r2,  r2,  #2
         vld1.8          {d4},[r0,:64], r1
         vmull.u8        q1,  d0,  d4
         pld             [r0]
@@ -1806,10 +1788,10 @@ endfunc
         .endm
 
         .macro  weight_4 add
-        vdup.8          d0,  r3
+        vdup.8          d0,  r12
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r2,  r2,  #4
         vld1.32         {d4[0]},[r0,:32], r1
         vld1.32         {d4[1]},[r0,:32], r1
         vmull.u8        q1,  d0,  d4
@@ -1842,50 +1824,32 @@ endfunc
         .endm
 
         .macro  weight_func w
-function weight_h264_pixels_\w\()_neon
+function ff_weight_h264_pixels_\w\()_neon, export=1
         push            {r4, lr}
-        ldr             r4,  [sp, #8]
-        cmp             r2,  #1
-        lsl             r4,  r4,  r2
+        ldr             r12, [sp, #8]
+        ldr             r4,  [sp, #12]
+        cmp             r3,  #1
+        lsl             r4,  r4,  r3
         vdup.16         q8,  r4
         mov             r4,  r0
         ble             20f
-        rsb             lr,  r2,  #1
+        rsb             lr,  r3,  #1
         vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
         blt             10f
         weight_\w       vhadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
         weight_\w       vhsub.s16
-20:     rsb             lr,  r2,  #0
+20:     rsb             lr,  r3,  #0
         vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
         blt             10f
         weight_\w       vadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
         weight_\w       vsub.s16
 endfunc
         .endm
 
-        .macro  weight_entry w, h, b=1
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               weight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        weight_entry    16, 8
-        weight_entry    16, 16, b=0
         weight_func     16
-
-        weight_entry    8,  16
-        weight_entry    8,  4
-        weight_entry    8,  8,  b=0
         weight_func     8
-
-        weight_entry    4,  8
-        weight_entry    4,  2
-        weight_entry    4,  4,  b=0
         weight_func     4
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 8d652f13ce..7306828197 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
 }
 #endif
 
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
+static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
+                               int height, int delta, int list,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int src_x_offset, int src_y_offset,
                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
@@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
             src_cb= s->edge_emu_buffer;
     }
-    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 
     if(emu){
         s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
             src_cr= s->edge_emu_buffer;
     }
-    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 }
 
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
 
     if(list0){
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
+        mc_dir_part(h, ref, n, square, height, delta, 0,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
                            qpix_op, chroma_op, pixel_shift, chroma444);
 
@@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
 
     if(list1){
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
+        mc_dir_part(h, ref, n, square, height, delta, 1,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
                            qpix_op, chroma_op, pixel_shift, chroma444);
     }
 }
 
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                            int list0, int list1, int pixel_shift, int chroma444){
     MpegEncContext * const s = &h->s;
+    int chroma_height;
 
     dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
     if(chroma444){
+        chroma_height = height;
         chroma_weight_avg = luma_weight_avg;
         chroma_weight_op = luma_weight_op;
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
     } else if (CHROMA422) {
+        chroma_height = height;
         dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
     }else{
+        chroma_height = height >> 1;
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
     }
@@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         int refn0 = h->ref_cache[0][ scan8[n] ];
         int refn1 = h->ref_cache[1][ scan8[n] ];
 
-        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
+        mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
                     dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
-        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
+        mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
                     tmp_y, tmp_cb, tmp_cr,
                     x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
 
         if(h->use_weight == 2){
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
             int weight1 = 64 - weight0;
-            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
-            if (CHROMA422) {
-                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
-                                  tmp_cb + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
-                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
-                                  tmp_cr + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
-            }
+            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize,
+                              height,        5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
+                              chroma_height, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
+                              chroma_height, 5, weight0, weight1, 0);
         }else{
-            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
+            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                             h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
                             h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                             h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                             h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
-            if (CHROMA422) {
-                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
-                                  tmp_cb + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                  h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
-                                  h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
-                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
-                                  tmp_cr + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                  h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
-                                  h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
-            }
         }
     }else{
         int list = list1 ? 1 : 0;
         int refn = h->ref_cache[list][ scan8[n] ];
         Picture *ref= &h->ref_list[list][refn];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
+        mc_dir_part(h, ref, n, square, height, delta, list,
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put, chroma_put, pixel_shift, chroma444);
 
-        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
+        luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
         if(h->use_weight_chroma){
-            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                              h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
-            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                              h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
-            if (CHROMA422) {
-                chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
-                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                 h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
-                chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
-                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                 h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
-            }
         }
     }
 }
 
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
     if((h->use_weight==2 && list0 && list1
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
        || h->use_weight==1)
-        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                          x_offset, y_offset, qpix_put, chroma_put,
-                         weight_op[0], weight_op[3], weight_avg[0],
-                         weight_avg[3], list0, list1, pixel_shift, chroma444);
+                         weight_op[0], weight_op[1], weight_avg[0],
+                         weight_avg[1], list0, list1, pixel_shift, chroma444);
     else
-        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
                     chroma_avg, list0, list1, pixel_shift, chroma444);
 }
@@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
     prefetch_motion(h, 0, pixel_shift, chroma444);
 
     if(IS_16X16(mb_type)){
-        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                 pixel_shift, chroma444);
     }else if(IS_16X8(mb_type)){
-        mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                 pixel_shift, chroma444);
-        mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
+        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                 pixel_shift, chroma444);
     }else if(IS_8X16(mb_type)){
-        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                 pixel_shift, chroma444);
-        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
+        mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                 pixel_shift, chroma444);
     }else{
@@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
             int y_offset= (i&2)<<1;
 
             if(IS_SUB_8X8(sub_mb_type)){
-                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                    &weight_op[3], &weight_avg[3],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
             }else if(IS_SUB_8X4(sub_mb_type)){
-                mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
-                mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+                mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
             }else if(IS_SUB_4X8(sub_mb_type)){
-                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
-                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+                mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
             }else{
@@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                 for(j=0; j<4; j++){
                     int sub_x_offset= x_offset + 2*(j&1);
                     int sub_y_offset= y_offset +   (j&2);
-                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+                    mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[6], &weight_avg[6],
+                        &weight_op[2], &weight_avg[2],
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                         pixel_shift, chroma444);
                 }
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index 19ad2db3d9..ba967079fb 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
     else\
         c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
 \
-    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
-    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
-    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
-    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
-    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
-    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
-    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
-    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
-    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
-    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
-    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
-    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
-    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
-    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
-    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
-    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
-    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
-    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
-    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
-    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
+    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
+    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
+    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
+    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
+    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
+    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
+    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
+    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
 \
     c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
     c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 7337f178e9..7cae215a95 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -31,16 +31,18 @@
 #include "dsputil.h"
 
 //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
+                                 int log2_denom, int weight, int offset);
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
+                                   int log2_denom, int weightd, int weights, int offset);
 
 /**
  * Context for storing H.264 DSP functions
  */
 typedef struct H264DSPContext{
     /* weighted MC */
-    h264_weight_func weight_h264_pixels_tab[10];
-    h264_biweight_func biweight_h264_pixels_tab[10];
+    h264_weight_func weight_h264_pixels_tab[4];
+    h264_biweight_func biweight_h264_pixels_tab[4];
 
     /* loop filter */
     void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index ee4bbe51dc..3d99cfcfec 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -29,14 +29,16 @@
 
 #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
 #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
-static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
+#define H264_WEIGHT(W) \
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
+                                           int log2_denom, int weight, int offset) \
+{ \
     int y; \
     pixel *block = (pixel*)_block; \
     stride /= sizeof(pixel); \
     offset <<= (log2_denom + (BIT_DEPTH-8)); \
     if(log2_denom) offset += 1<<(log2_denom-1); \
-    for(y=0; y<H; y++, block += stride){ \
+    for (y = 0; y < height; y++, block += stride) { \
         op_scale1(0); \
         op_scale1(1); \
         if(W==2) continue; \
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
         op_scale1(15); \
     } \
 } \
-static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
+                                             int log2_denom, int weightd, int weights, int offset) \
+{ \
     int y; \
     pixel *dst = (pixel*)_dst; \
     pixel *src = (pixel*)_src; \
     stride /= sizeof(pixel); \
     offset <<= (BIT_DEPTH-8); \
     offset = ((offset + 1) | 1) << log2_denom; \
-    for(y=0; y<H; y++, dst += stride, src += stride){ \
+    for (y = 0; y < height; y++, dst += stride, src += stride) { \
         op_scale2(0); \
         op_scale2(1); \
         if(W==2) continue; \
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
     } \
 }
 
-H264_WEIGHT(16,16)
-H264_WEIGHT(16,8)
-H264_WEIGHT(8,16)
-H264_WEIGHT(8,8)
-H264_WEIGHT(8,4)
-H264_WEIGHT(4,8)
-H264_WEIGHT(4,4)
-H264_WEIGHT(4,2)
-H264_WEIGHT(2,4)
-H264_WEIGHT(2,2)
+H264_WEIGHT(16)
+H264_WEIGHT(8)
+H264_WEIGHT(4)
+H264_WEIGHT(2)
 
 #undef op_scale1
 #undef op_scale2
diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index a9153788de..edc043c3c7 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
 }
 
 static av_always_inline
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
+void weight_h264_W_altivec(uint8_t *block, int stride, int height,
+                           int log2_denom, int weight, int offset, int w)
 {
     int y, aligned;
     vec_u8 vblock;
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
     voffset = vec_splat(vtemp, 5);
     aligned = !((unsigned long)block & 0xf);
 
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
         v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
 }
 
 static av_always_inline
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-                               int weightd, int weights, int offset, int w, int h)
+void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
+                             int log2_denom, int weightd, int weights, int offset, int w)
 {
     int y, dst_aligned, src_aligned;
     vec_u8 vsrc, vdst;
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
     dst_aligned = !((unsigned long)dst & 0xf);
     src_aligned = !((unsigned long)src & 0xf);
 
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
     }
 }
 
-#define H264_WEIGHT(W,H) \
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
-    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
+#define H264_WEIGHT(W) \
+static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
+                                                   int log2_denom, int weight, int offset){ \
+    weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \
 }\
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
-    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
+                                                     int log2_denom, int weightd, int weights, int offset){ \
+    biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
 }
 
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
+H264_WEIGHT(16)
+H264_WEIGHT( 8)
 
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
         c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
         c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
 
-        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
-        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
-        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
-        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
-        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
-        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
-        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
-        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
-        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
-        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
     }
     }
 }
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index d80ca32583..bc8bfd686e 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -28,21 +28,20 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 ; biweight pred:
 ;
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
-;                               int log2_denom, int weightd, int weights,
-;                               int offset);
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
+;                            int height, int log2_denom, int weightd,
+;                            int weights, int offset);
 ; and
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
-;                             int log2_denom, int weight,
-;                             int offset);
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
+;                          int log2_denom, int weight, int offset);
 ;-----------------------------------------------------------------------------
 
 %macro WEIGHT_SETUP 0
-    add        r4, r4
-    inc        r4
-    movd       m3, r3d
-    movd       m5, r4d
-    movd       m6, r2d
+    add        r5, r5
+    inc        r5
+    movd       m3, r4d
+    movd       m5, r5d
+    movd       m6, r3d
     pslld      m5, m6
     psrld      m5, 1
 %if mmsize == 16
@@ -71,60 +70,41 @@ SECTION .text
     packuswb      m0, m1
 %endmacro
 
-%macro WEIGHT_FUNC_DBL_MM 1
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0
+INIT_MMX
+cglobal h264_weight_16_mmx2, 6, 6, 0
     WEIGHT_SETUP
-    mov        r2, %1
-%if %1 == 16
 .nextrow
     WEIGHT_OP 0,  4
     mova     [r0  ], m0
     WEIGHT_OP 8, 12
     mova     [r0+8], m0
     add        r0, r1
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
-%endif
-%endmacro
 
-INIT_MMX
-WEIGHT_FUNC_DBL_MM 16
-WEIGHT_FUNC_DBL_MM  8
-
-%macro WEIGHT_FUNC_MM 4
-cglobal h264_weight_%1x%2_%4, 7, 7, %3
+%macro WEIGHT_FUNC_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-    mov        r2, %2
-%if %2 == 16
 .nextrow
     WEIGHT_OP 0, mmsize/2
     mova     [r0], m0
     add        r0, r1
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-WEIGHT_FUNC_MM  8, 16,  0, mmx2
-WEIGHT_FUNC_MM  8,  8,  0, mmx2
-WEIGHT_FUNC_MM  8,  4,  0, mmx2
+WEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
-WEIGHT_FUNC_MM 16, 16,  8, sse2
-WEIGHT_FUNC_MM 16,  8,  8, sse2
+WEIGHT_FUNC_MM 16, 8, sse2
 
-%macro WEIGHT_FUNC_HALF_MM 5
-cglobal h264_weight_%1x%2_%5, 5, 5, %4
+%macro WEIGHT_FUNC_HALF_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-    mov        r2, %2/2
+    sar       r2d, 1
     lea        r3, [r1*2]
-%if %2 == mmsize
 .nextrow
     WEIGHT_OP 0, r1
     movh     [r0], m0
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
     movh     [r0+r1], m0
 %endif
     add        r0, r3
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
 
 %macro BIWEIGHT_SETUP 0
-    add        r6, 1
-    or         r6, 1
-    add        r3, 1
-    movd       m3, r4d
-    movd       m4, r5d
-    movd       m5, r6d
-    movd       m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+    mov  off_regd, r7m
+    add  off_regd, 1
+    or   off_regd, 1
+    add        r4, 1
+    movd       m3, r5d
+    movd       m4, r6d
+    movd       m5, off_regd
+    movd       m6, r4d
     pslld      m5, m6
     psrld      m5, 1
 %if mmsize == 16
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
     packuswb   m0, m1
 %endmacro
 
-%macro BIWEIGHT_FUNC_DBL_MM 1
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
+INIT_MMX
+cglobal h264_biweight_16_mmx2, 7, 7, 0
     BIWEIGHT_SETUP
-    mov        r3, %1
-%if %1 == 16
+    movifnidn r3d, r3m
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, 4
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
     mova     [r0+8], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
-%endif
-%endmacro
 
-INIT_MMX
-BIWEIGHT_FUNC_DBL_MM 16
-BIWEIGHT_FUNC_DBL_MM  8
-
-%macro BIWEIGHT_FUNC_MM 4
-cglobal h264_biweight_%1x%2_%4, 7, 7, %3
+%macro BIWEIGHT_FUNC_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
     BIWEIGHT_SETUP
-    mov        r3, %2
-%if %2 == 16
+    movifnidn r3d, r3m
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, mmsize/2
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
     mova       [r0], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
-BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
-BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
+BIWEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_MM 16, 16,  8, sse2
-BIWEIGHT_FUNC_MM 16,  8,  8, sse2
+BIWEIGHT_FUNC_MM 16, 8, sse2
 
-%macro BIWEIGHT_FUNC_HALF_MM 5
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4
+%macro BIWEIGHT_FUNC_HALF_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
     BIWEIGHT_SETUP
-    mov        r3, %2/2
+    movifnidn r3d, r3m
+    sar        r3, 1
     lea        r4, [r2*2]
-%if %2 == mmsize
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, r2
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
 %endif
     add        r0, r4
     add        r1, r4
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
 
 %macro BIWEIGHT_SSSE3_SETUP 0
-    add        r6, 1
-    or         r6, 1
-    add        r3, 1
-    movd       m4, r4d
-    movd       m0, r5d
-    movd       m5, r6d
-    movd       m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+    mov  off_regd, r7m
+    add  off_regd, 1
+    or   off_regd, 1
+    add        r4, 1
+    movd       m4, r5d
+    movd       m0, r6d
+    movd       m5, off_regd
+    movd       m6, r4d
     pslld      m5, m6
     psrld      m5, 1
     punpcklbw  m4, m0
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
     packuswb   m0, m2
 %endmacro
 
-%macro BIWEIGHT_SSSE3_16 1
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
+INIT_XMM
+cglobal h264_biweight_16_ssse3, 7, 7, 8
     BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1
+    movifnidn r3d, r3m
 
-%if %1 == 16
 .nextrow
     movh       m0, [r0]
     movh       m2, [r0+8]
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
     mova       [r0], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
-%endif
-%endmacro
 
 INIT_XMM
-BIWEIGHT_SSSE3_16 16
-BIWEIGHT_SSSE3_16  8
-
-%macro BIWEIGHT_SSSE3_8 1
-cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
+cglobal h264_biweight_8_ssse3, 7, 7, 8
     BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1/2
+    movifnidn r3d, r3m
+    sar        r3, 1
     lea        r4, [r2*2]
 
-%if %1 == 16
 .nextrow
     movh       m0, [r0]
     movh       m1, [r1]
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
     movhps     [r0+r2], m0
     add        r0, r4
     add        r1, r4
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
-%endif
-%endmacro
-
-INIT_XMM
-BIWEIGHT_SSSE3_8 16
-BIWEIGHT_SSSE3_8  8
-BIWEIGHT_SSSE3_8  4
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 1c58d72d94..20df6fbab5 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -36,33 +36,26 @@ cextern pw_1
 SECTION .text
 
 ;-----------------------------------------------------------------------------
-; void h264_weight(uint8_t *dst, int stride, int log2_denom,
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
 ;                  int weight, int offset);
 ;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_32
-DECLARE_REG_TMP 2
-%else
-DECLARE_REG_TMP 10
-%endif
-
-%macro WEIGHT_PROLOGUE 1
-    mov t0, %1
+%macro WEIGHT_PROLOGUE 0
 .prologue
-    PROLOGUE 0,5,8
+    PROLOGUE 0,6,8
     movifnidn  r0, r0mp
     movifnidn r1d, r1m
-    movifnidn r3d, r3m
     movifnidn r4d, r4m
+    movifnidn r5d, r5m
 %endmacro
 
 %macro WEIGHT_SETUP 1
     mova       m0, [pw_1]
-    movd       m2, r2m
+    movd       m2, r3m
     pslld      m0, m2       ; 1<<log2_denom
     SPLATW     m0, m0
-    shl        r4, 19       ; *8, move to upper half of dword
-    lea        r4, [r4+r3*2+0x10000]
-    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
+    shl        r5, 19       ; *8, move to upper half of dword
+    lea        r5, [r5+r4*2+0x10000]
+    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
     pshufd     m3, m3, 0
     mova       m4, [pw_pixel_max]
     paddw      m2, [sq_1]   ; log2_denom+1
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
 %endmacro
 
 %macro WEIGHT_FUNC_DBL 1
-cglobal h264_weight_16x16_10_%1
-    WEIGHT_PROLOGUE 16
+cglobal h264_weight_16_10_%1
+    WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
 .nextrow
     WEIGHT_OP %1,  0
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
     WEIGHT_OP %1, 16
     mova [r0+16], m5
     add       r0, r1
-    dec       t0
+    dec       r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_16x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
 
 
 %macro WEIGHT_FUNC_MM 1
-cglobal h264_weight_8x16_10_%1
-    WEIGHT_PROLOGUE 16
+cglobal h264_weight_8_10_%1
+    WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
 .nextrow
     WEIGHT_OP  %1, 0
     mova     [r0], m5
     add        r0, r1
-    dec        t0
+    dec        r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_8x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
-
-cglobal h264_weight_8x4_10_%1
-    mov t0, 4
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
 
 
 %macro WEIGHT_FUNC_HALF_MM 1
-cglobal h264_weight_4x8_10_%1
-    WEIGHT_PROLOGUE 4
+cglobal h264_weight_4_10_%1
+    WEIGHT_PROLOGUE
+    sar         r2d, 1
     WEIGHT_SETUP %1
     lea         r3, [r1*2]
 .nextrow
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
     movh      [r0], m5
     movhps [r0+r1], m5
     add         r0, r3
-    dec         t0
+    dec         r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_4x4_10_%1
-    mov t0, 2
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
-
-cglobal h264_weight_4x2_10_%1
-    mov t0, 1
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
 
 
 ;-----------------------------------------------------------------------------
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-;                    int weightd, int weights, int offset);
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
+;                    int log2_denom, int weightd, int weights, int offset);
 ;-----------------------------------------------------------------------------
 %ifdef ARCH_X86_32
-DECLARE_REG_TMP 2,3
+DECLARE_REG_TMP 3
 %else
-DECLARE_REG_TMP 10,2
+DECLARE_REG_TMP 10
 %endif
 
-%macro BIWEIGHT_PROLOGUE 1
-    mov t0, %1
+%macro BIWEIGHT_PROLOGUE 0
 .prologue
     PROLOGUE 0,7,8
     movifnidn  r0, r0mp
     movifnidn  r1, r1mp
-    movifnidn t1d, r2m
-    movifnidn r4d, r4m
+    movifnidn r2d, r2m
     movifnidn r5d, r5m
     movifnidn r6d, r6m
+    movifnidn t0d, r7m
 %endmacro
 
 %macro BIWEIGHT_SETUP 1
-    lea        r6, [r6*4+1] ; (offset<<2)+1
-    or         r6, 1
-    shl        r5, 16
-    or         r4, r5
-    movd       m4, r4d      ; weightd | weights
-    movd       m5, r6d      ; (offset+1)|1
-    movd       m6, r3m      ; log2_denom
+    lea        t0, [t0*4+1] ; (offset<<2)+1
+    or         t0, 1
+    shl        r6, 16
+    or         r5, r6
+    movd       m4, r5d      ; weightd | weights
+    movd       m5, t0d      ; (offset+1)|1
+    movd       m6, r4m      ; log2_denom
     pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
     paddd      m6, [sq_1]
     pshufd     m4, m4, 0
     pshufd     m5, m5, 0
     mova       m3, [pw_pixel_max]
+    movifnidn r3d, r3m
 %ifnidn %1, sse4
     pxor       m7, m7
 %endif
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
 %endmacro
 
 %macro BIWEIGHT_FUNC_DBL 1
-cglobal h264_biweight_16x16_10_%1
-    BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_16_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
 .nextrow
     BIWEIGHT  %1,  0
     mova [r0   ], m0
     BIWEIGHT  %1, 16
     mova [r0+16], m0
-    add       r0, t1
-    add       r1, t1
-    dec       t0
+    add       r0, r2
+    add       r1, r2
+    dec       r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_16x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
 BIWEIGHT_FUNC_DBL sse4
 
 %macro BIWEIGHT_FUNC 1
-cglobal h264_biweight_8x16_10_%1
-    BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_8_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
 .nextrow
     BIWEIGHT %1, 0
     mova   [r0], m0
-    add      r0, t1
-    add      r1, t1
-    dec      t0
+    add      r0, r2
+    add      r1, r2
+    dec      r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_8x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
-
-cglobal h264_biweight_8x4_10_%1
-    mov t0, 4
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
 BIWEIGHT_FUNC sse4
 
 %macro BIWEIGHT_FUNC_HALF 1
-cglobal h264_biweight_4x8_10_%1
-    BIWEIGHT_PROLOGUE 4
+cglobal h264_biweight_4_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
-    lea        r4, [t1*2]
+    sar        r3d, 1
+    lea        r4, [r2*2]
 .nextrow
-    BIWEIGHT    %1, 0, t1
+    BIWEIGHT    %1, 0, r2
     movh   [r0   ], m0
-    movhps [r0+t1], m0
+    movhps [r0+r2], m0
     add         r0, r4
     add         r1, r4
-    dec         t0
+    dec         r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_4x4_10_%1
-    mov t0, 2
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
-
-cglobal h264_biweight_4x2_10_%1
-    mov t0, 1
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
 %endmacro
 
 INIT_XMM
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 06ee7cad43..dcd918013c 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -298,57 +298,47 @@ LF_IFUNC(v,  luma_intra,      10, mmxext)
 /***********************************/
 /* weighted prediction */
 
-#define H264_WEIGHT(W, H, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
-    int stride, int log2_denom, int weight, int offset);
+#define H264_WEIGHT(W, OPT) \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
+    int stride, int height, int log2_denom, int weight, int offset);
 
-#define H264_BIWEIGHT(W, H, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
-    uint8_t *src, int stride, int log2_denom, int weightd, \
+#define H264_BIWEIGHT(W, OPT) \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
+    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
     int weights, int offset);
 
-#define H264_BIWEIGHT_MMX(W,H) \
-H264_WEIGHT  (W, H, mmx2) \
-H264_BIWEIGHT(W, H, mmx2)
-
-#define H264_BIWEIGHT_MMX_SSE(W,H) \
-H264_BIWEIGHT_MMX(W, H) \
-H264_WEIGHT      (W, H, sse2) \
-H264_BIWEIGHT    (W, H, sse2) \
-H264_BIWEIGHT    (W, H, ssse3)
-
-H264_BIWEIGHT_MMX_SSE(16, 16)
-H264_BIWEIGHT_MMX_SSE(16,  8)
-H264_BIWEIGHT_MMX_SSE( 8, 16)
-H264_BIWEIGHT_MMX_SSE( 8,  8)
-H264_BIWEIGHT_MMX_SSE( 8,  4)
-H264_BIWEIGHT_MMX    ( 4,  8)
-H264_BIWEIGHT_MMX    ( 4,  4)
-H264_BIWEIGHT_MMX    ( 4,  2)
-
-#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
-    int stride, int log2_denom, int weight, int offset);
-
-#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
+#define H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT  (W, mmx2) \
+H264_BIWEIGHT(W, mmx2)
+
+#define H264_BIWEIGHT_MMX_SSE(W) \
+H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT      (W, sse2) \
+H264_BIWEIGHT    (W, sse2) \
+H264_BIWEIGHT    (W, ssse3)
+
+H264_BIWEIGHT_MMX_SSE(16)
+H264_BIWEIGHT_MMX_SSE( 8)
+H264_BIWEIGHT_MMX    ( 4)
+
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+    int stride, int height, int log2_denom, int weight, int offset);
+
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
      int weightd, int weights, int offset);
 
-#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
-H264_WEIGHT_10  (W, H, DEPTH, sse2) \
-H264_WEIGHT_10  (W, H, DEPTH, sse4) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse4)
-
-H264_BIWEIGHT_10_SSE(16, 16, 10)
-H264_BIWEIGHT_10_SSE(16,  8, 10)
-H264_BIWEIGHT_10_SSE( 8, 16, 10)
-H264_BIWEIGHT_10_SSE( 8,  8, 10)
-H264_BIWEIGHT_10_SSE( 8,  4, 10)
-H264_BIWEIGHT_10_SSE( 4,  8, 10)
-H264_BIWEIGHT_10_SSE( 4,  4, 10)
-H264_BIWEIGHT_10_SSE( 4,  2, 10)
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
+H264_WEIGHT_10  (W, DEPTH, sse2) \
+H264_WEIGHT_10  (W, DEPTH, sse4) \
+H264_BIWEIGHT_10(W, DEPTH, sse2) \
+H264_BIWEIGHT_10(W, DEPTH, sse4)
+
+H264_BIWEIGHT_10_SSE(16, 10)
+H264_BIWEIGHT_10_SSE( 8, 10)
+H264_BIWEIGHT_10_SSE( 4, 10)
 
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
-            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
-            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
-            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
-            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
-            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
-            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
-            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
-            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
-            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
-            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
+            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
+            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
+
+            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
+            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
+            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
 
             if (mm_flags&AV_CPU_FLAG_SSE2) {
                 c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                 c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                 c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
 
-                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
-                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
-                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
-                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
-                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
+                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
+                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
 
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
 
 #if HAVE_ALIGNED_STACK
                 c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
             }
             if (mm_flags&AV_CPU_FLAG_SSSE3) {
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
             }
             if (mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                 c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
 #endif
 
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
-
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
 
                 c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                 c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
             }
             if (mm_flags&AV_CPU_FLAG_SSE4) {
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
-
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
             }
 #if HAVE_AVX
             if (mm_flags&AV_CPU_FLAG_AVX) {