From 8305041e137f4f2a49669dd588bf6ccfbbac2b58 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 19 Oct 2011 19:56:56 -0700
Subject: [PATCH 01/35] swscale: prevent overflow in coefficient calculation.

---
 libswscale/utils.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 8e5daf99dc..ea5a1ab468 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -271,19 +271,20 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
                 floatd= d * (1.0/(1<<30));
 
                 if (flags & SWS_BICUBIC) {
-#define SQRT_INT64_MAX 0xb504f333
                     int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] :   0) * (1<<24);
                     int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24);
-                    int64_t dd  = d > SQRT_INT64_MAX ? ((d  >> 1) * d) >> 29 : (d  * d) >> 30;
-                    int64_t ddd = d > SQRT_INT64_MAX || dd > SQRT_INT64_MAX ?
-                                                       ((dd >> 2) * d) >> 28 : (dd * d) >> 30;
-
-                    if      (d < 1LL<<30)
-                        coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
-                    else if (d < 1LL<<31)
-                        coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
-                    else
-                        coeff=0.0;
+
+                    if (d >= 1LL<<31) {
+                        coeff = 0.0;
+                    } else {
+                        int64_t dd  = (d  * d) >> 30;
+                        int64_t ddd = (dd * d) >> 30;
+
+                        if (d < 1LL<<30)
+                            coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
+                        else
+                            coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
+                    }
                     coeff *= fone>>(30+24);
                 }
 /*                else if (flags & SWS_X) {

From ce42a04884cd6585c596f1ecfe737dacc3e6f396 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Mon, 17 Oct 2011 17:10:16 -0700
Subject: [PATCH 02/35] vp8: fix up handling of segmentation_maps in reference
 frames.

Associate segmentation_map[] with the reference frame rather than with
the decoding instance. This fixes cases where the map would be free()'d
on e.g. a size change in one thread while another thread was still
accessing it. It also fixes cases where a thread overwrites data that
is still being referenced by the previous thread, which treats that
data as part of the frame previously decoded by the next thread.
---
 libavcodec/vp8.c | 68 +++++++++++++++++++++++++++++++++++++++---------
 libavcodec/vp8.h | 11 +++++++-
 2 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 7442b99252..d5cdaba486 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -41,24 +41,57 @@ static void free_buffers(VP8Context *s)
     av_freep(&s->top_nnz);
     av_freep(&s->edge_emu_buffer);
     av_freep(&s->top_border);
-    av_freep(&s->segmentation_map);
 
     s->macroblocks = NULL;
 }
 
-static void vp8_decode_flush(AVCodecContext *avctx)
+static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
+{
+    int ret;
+    if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
+        return ret;
+    if (!s->maps_are_invalid && s->num_maps_to_be_freed) {
+        f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
+    } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
+        ff_thread_release_buffer(s->avctx, f);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
+}
+
+static void vp8_release_frame(VP8Context *s, AVFrame *f, int is_close)
+{
+    if (!is_close) {
+        if (f->ref_index[0]) {
+            assert(s->num_maps_to_be_freed < FF_ARRAY_ELEMS(s->segmentation_maps));
+            s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
+            f->ref_index[0] = NULL;
+        }
+    } else {
+        av_freep(&f->ref_index[0]);
+    }
+    ff_thread_release_buffer(s->avctx, f);
+}
+
+static void vp8_decode_flush_impl(AVCodecContext *avctx, int force, int is_close)
 {
     VP8Context *s = avctx->priv_data;
     int i;
 
-    if (!avctx->is_copy) {
+    if (!avctx->is_copy || force) {
         for (i = 0; i < 5; i++)
             if (s->frames[i].data[0])
-                ff_thread_release_buffer(avctx, &s->frames[i]);
+                vp8_release_frame(s, &s->frames[i], is_close);
     }
     memset(s->framep, 0, sizeof(s->framep));
 
     free_buffers(s);
+    s->maps_are_invalid = 1;
+}
+
+static void vp8_decode_flush(AVCodecContext *avctx)
+{
+    vp8_decode_flush_impl(avctx, 0, 0);
 }
 
 static int update_dimensions(VP8Context *s, int width, int height)
@@ -68,7 +101,7 @@ static int update_dimensions(VP8Context *s, int width, int height)
         if (av_image_check_size(width, height, 0, s->avctx))
             return AVERROR_INVALIDDATA;
 
-        vp8_decode_flush(s->avctx);
+        vp8_decode_flush_impl(s->avctx, 1, 0);
 
         avcodec_set_dimensions(s->avctx, width, height);
     }
@@ -81,10 +114,9 @@ static int update_dimensions(VP8Context *s, int width, int height)
     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
-    s->segmentation_map        = av_mallocz(s->mb_width*s->mb_height);
 
     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
-        !s->top_nnz || !s->top_border || !s->segmentation_map)
+        !s->top_nnz || !s->top_border)
         return AVERROR(ENOMEM);
 
     s->macroblocks        = s->macroblocks_base + 1;
@@ -1508,6 +1540,14 @@ static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
     }
 }
 
+static void release_queued_segmaps(VP8Context *s, int is_close)
+{
+    int leave_behind = is_close ? 0 : !s->maps_are_invalid;
+    while (s->num_maps_to_be_freed > leave_behind)
+        av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
+    s->maps_are_invalid = 0;
+}
+
 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
                             AVPacket *avpkt)
 {
@@ -1516,6 +1556,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     enum AVDiscard skip_thresh;
     AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT];
 
+    release_queued_segmaps(s, 0);
+
     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
         return ret;
 
@@ -1538,7 +1580,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
-            ff_thread_release_buffer(avctx, &s->frames[i]);
+            vp8_release_frame(s, &s->frames[i], 0);
 
     // find a free buffer
     for (i = 0; i < 5; i++)
@@ -1559,8 +1601,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     curframe->key_frame = s->keyframe;
     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
     curframe->reference = referenced ? 3 : 0;
-    curframe->ref_index[0] = s->segmentation_map;
-    if ((ret = ff_thread_get_buffer(avctx, curframe))) {
+    if ((ret = vp8_alloc_frame(s, curframe))) {
         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
         return ret;
     }
@@ -1652,8 +1693,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
 
-            decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy,
-                           prev_frame ? prev_frame->ref_index[0] + mb_xy : NULL);
+            decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
+                           prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
 
             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
 
@@ -1736,7 +1777,8 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx)
 
 static av_cold int vp8_decode_free(AVCodecContext *avctx)
 {
-    vp8_decode_flush(avctx);
+    vp8_decode_flush_impl(avctx, 0, 1);
+    release_queued_segmaps(avctx->priv_data, 1);
     return 0;
 }
 
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 5a96cd436c..6cbdca2d88 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -130,7 +130,6 @@ typedef struct {
 
     uint8_t *intra4x4_pred_mode_top;
     uint8_t intra4x4_pred_mode_left[4];
-    uint8_t *segmentation_map;
 
     /**
      * Macroblocks can have one of 4 different quants in a frame when
@@ -237,6 +236,16 @@ typedef struct {
     H264PredContext hpc;
     vp8_mc_func put_pixels_tab[3][3][3];
     AVFrame frames[5];
+
+    /**
+     * A list of segmentation_map buffers that are to be free()'ed in
+     * the next decoding iteration. We can't free() them right away
+     * because the map may still be used by subsequent decoding threads.
+     * Unused if frame threading is off.
+     */
+    uint8_t *segmentation_maps[5];
+    int num_maps_to_be_freed;
+    int maps_are_invalid;
 } VP8Context;
 
 #endif /* AVCODEC_VP8_H */

From e85297e0e7396c9724e57ac949f10e5eb345bf54 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 21 Oct 2011 00:21:31 -0700
Subject: [PATCH 03/35] Revert d1d421cbc0d13b08535f7fc08d179572ee352072: change
 to fate-lavfi-crop_scale.

---
 tests/ref/lavfi/crop_scale | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ref/lavfi/crop_scale b/tests/ref/lavfi/crop_scale
index ae26c8ee6d..82e5394496 100644
--- a/tests/ref/lavfi/crop_scale
+++ b/tests/ref/lavfi/crop_scale
@@ -1 +1 @@
-crop_scale          f8cad857d2b7102fc256532ec9849da7
+crop_scale          0a3d45d58b805b8c47416b9239535f94

From dc49bf127010fdff2c3282755407cedd429475f5 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 21 Oct 2011 00:38:04 -0700
Subject: [PATCH 04/35] sws/pixfmt/pixdesc: add support for yuv422p9le/be.

---
 libavcodec/utils.c            |  2 ++
 libavutil/pixdesc.c           | 23 +++++++++++++++++++++++
 libavutil/pixfmt.h            |  3 +++
 libswscale/swscale.c          |  4 ++++
 libswscale/swscale_internal.h |  4 ++++
 libswscale/utils.c            |  2 ++
 tests/ref/lavfi/pixdesc       |  2 ++
 tests/ref/lavfi/pixfmts_copy  |  2 ++
 tests/ref/lavfi/pixfmts_null  |  2 ++
 tests/ref/lavfi/pixfmts_scale |  2 ++
 tests/ref/lavfi/pixfmts_vflip |  2 ++
 11 files changed, 48 insertions(+)

diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index c1a5c19e04..3e4926273f 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -158,6 +158,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l
     case PIX_FMT_YUV420P9BE:
     case PIX_FMT_YUV420P10LE:
     case PIX_FMT_YUV420P10BE:
+    case PIX_FMT_YUV422P9LE:
+    case PIX_FMT_YUV422P9BE:
     case PIX_FMT_YUV422P10LE:
     case PIX_FMT_YUV422P10BE:
     case PIX_FMT_YUV444P9LE:
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index c70a41347b..f7df0eba65 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -809,6 +809,29 @@ const AVPixFmtDescriptor av_pix_fmt_descriptors[PIX_FMT_NB] = {
         },
         .flags = PIX_FMT_BE,
     },
+    [PIX_FMT_YUV422P9LE] = {
+        .name = "yuv422p9le",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,8},        /* Y */
+            {1,1,1,0,8},        /* U */
+            {2,1,1,0,8},        /* V */
+        },
+    },
+    [PIX_FMT_YUV422P9BE] = {
+        .name = "yuv422p9be",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,8},        /* Y */
+            {1,1,1,0,8},        /* U */
+            {2,1,1,0,8},        /* V */
+        },
+        .flags = PIX_FMT_BE,
+    },
     [PIX_FMT_YUV422P10LE] = {
         .name = "yuv422p10le",
         .nb_components= 3,
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 9bf793866a..7068b43fed 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -145,6 +145,8 @@ enum PixelFormat {
     PIX_FMT_YUV444P9LE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
     PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
     PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    PIX_FMT_YUV422P9BE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    PIX_FMT_YUV422P9LE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
     PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
 };
 
@@ -170,6 +172,7 @@ enum PixelFormat {
 #define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)
 
 #define PIX_FMT_YUV420P9  PIX_FMT_NE(YUV420P9BE , YUV420P9LE)
+#define PIX_FMT_YUV422P9  PIX_FMT_NE(YUV422P9BE , YUV422P9LE)
 #define PIX_FMT_YUV444P9  PIX_FMT_NE(YUV444P9BE , YUV444P9LE)
 #define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE)
 #define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE)
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 5d90250acf..227f65e301 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -2662,6 +2662,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
 #if HAVE_BIGENDIAN
         case PIX_FMT_YUV444P9LE:
+        case PIX_FMT_YUV422P9LE:
         case PIX_FMT_YUV420P9LE:
         case PIX_FMT_YUV422P10LE:
         case PIX_FMT_YUV444P10LE:
@@ -2671,6 +2672,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
 #else
         case PIX_FMT_YUV444P9BE:
+        case PIX_FMT_YUV422P9BE:
         case PIX_FMT_YUV420P9BE:
         case PIX_FMT_YUV444P10BE:
         case PIX_FMT_YUV422P10BE:
@@ -2729,6 +2731,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     switch (srcFormat) {
 #if HAVE_BIGENDIAN
     case PIX_FMT_YUV444P9LE:
+    case PIX_FMT_YUV422P9LE:
     case PIX_FMT_YUV420P9LE:
     case PIX_FMT_YUV444P10LE:
     case PIX_FMT_YUV422P10LE:
@@ -2739,6 +2742,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
 #else
     case PIX_FMT_YUV444P9BE:
+    case PIX_FMT_YUV422P9BE:
     case PIX_FMT_YUV420P9BE:
     case PIX_FMT_YUV444P10BE:
     case PIX_FMT_YUV422P10BE:
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a13b89d203..9b895b125b 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -535,6 +535,8 @@ const char *sws_format_name(enum PixelFormat format);
 #define is9_OR_10BPS(x) (           \
            (x)==PIX_FMT_YUV420P9LE  \
         || (x)==PIX_FMT_YUV420P9BE  \
+        || (x)==PIX_FMT_YUV422P9LE  \
+        || (x)==PIX_FMT_YUV422P9BE  \
         || (x)==PIX_FMT_YUV444P9BE  \
         || (x)==PIX_FMT_YUV444P9LE  \
         || (x)==PIX_FMT_YUV422P10BE \
@@ -559,6 +561,7 @@ const char *sws_format_name(enum PixelFormat format);
 #define isPlanarYUV(x)  (           \
         isPlanar8YUV(x)             \
         || (x)==PIX_FMT_YUV420P9LE  \
+        || (x)==PIX_FMT_YUV422P9LE  \
         || (x)==PIX_FMT_YUV444P9LE  \
         || (x)==PIX_FMT_YUV420P10LE \
         || (x)==PIX_FMT_YUV422P10LE \
@@ -567,6 +570,7 @@ const char *sws_format_name(enum PixelFormat format);
         || (x)==PIX_FMT_YUV422P16LE \
         || (x)==PIX_FMT_YUV444P16LE \
         || (x)==PIX_FMT_YUV420P9BE  \
+        || (x)==PIX_FMT_YUV422P9BE  \
         || (x)==PIX_FMT_YUV444P9BE  \
         || (x)==PIX_FMT_YUV420P10BE \
         || (x)==PIX_FMT_YUV422P10BE \
diff --git a/libswscale/utils.c b/libswscale/utils.c
index ea5a1ab468..64ac77d7b7 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -129,6 +129,8 @@ const static FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_YUV420P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV420P10BE] = { 1 , 1 },
     [PIX_FMT_YUV420P10LE] = { 1 , 1 },
+    [PIX_FMT_YUV422P9BE]  = { 1 , 1 },
+    [PIX_FMT_YUV422P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV422P10BE] = { 1 , 1 },
     [PIX_FMT_YUV422P10LE] = { 1 , 1 },
     [PIX_FMT_YUV444P9BE]  = { 1 , 1 },
diff --git a/tests/ref/lavfi/pixdesc b/tests/ref/lavfi/pixdesc
index f27d31bb00..c4d789095c 100644
--- a/tests/ref/lavfi/pixdesc
+++ b/tests/ref/lavfi/pixdesc
@@ -42,6 +42,8 @@ yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
 yuv422p16be         4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le         f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be          29b71579946940a8c00fa844c9dff507
+yuv422p9le          062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be         e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_copy b/tests/ref/lavfi/pixfmts_copy
index f27d31bb00..c4d789095c 100644
--- a/tests/ref/lavfi/pixfmts_copy
+++ b/tests/ref/lavfi/pixfmts_copy
@@ -42,6 +42,8 @@ yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
 yuv422p16be         4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le         f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be          29b71579946940a8c00fa844c9dff507
+yuv422p9le          062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be         e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_null b/tests/ref/lavfi/pixfmts_null
index f27d31bb00..c4d789095c 100644
--- a/tests/ref/lavfi/pixfmts_null
+++ b/tests/ref/lavfi/pixfmts_null
@@ -42,6 +42,8 @@ yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
 yuv422p16be         4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le         f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be          29b71579946940a8c00fa844c9dff507
+yuv422p9le          062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be         e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_scale b/tests/ref/lavfi/pixfmts_scale
index 4af1ca43de..83b523e8a3 100644
--- a/tests/ref/lavfi/pixfmts_scale
+++ b/tests/ref/lavfi/pixfmts_scale
@@ -42,6 +42,8 @@ yuv422p10be         cea7ca6b0e66d6f29539885896c88603
 yuv422p10le         a10c4a5837547716f13cd61918b145f9
 yuv422p16be         285993ee0c0f4f8e511ee46f93c5f38c
 yuv422p16le         61bfcee8e54465f760164f5a75d40b5e
+yuv422p9be          82494823944912f73cebc58ad2979bbd
+yuv422p9le          fc69c8a21f473916a4b4225636b97e06
 yuv440p             461503fdb9b90451020aa3b25ddf041c
 yuv444p             81b2eba962d12e8d64f003ac56f6faf2
 yuv444p10be         e9d3c8e744b8b0d8187ca092fa203fc9
diff --git a/tests/ref/lavfi/pixfmts_vflip b/tests/ref/lavfi/pixfmts_vflip
index 21988f16d6..66d803cbdf 100644
--- a/tests/ref/lavfi/pixfmts_vflip
+++ b/tests/ref/lavfi/pixfmts_vflip
@@ -42,6 +42,8 @@ yuv422p10be         588fe319b96513c32e21d3e32b45447f
 yuv422p10le         11b57f2bd9661024153f3973b9090cdb
 yuv422p16be         c092d083548c2a144c372a98c46875c7
 yuv422p16le         c071b9397a416d51cbe339345cbcba84
+yuv422p9be          7c6f1e140b3999ee7d923854e507752a
+yuv422p9le          51f10d79c07989060dd06e767e6d7d60
 yuv440p             876385e96165acf51271b20e5d85a416
 yuv444p             9c3c667d1613b72d15bc6d851c5eb8f7
 yuv444p10be         944a4997c4edb3a8dd0f0493cfd5a1fd

From 76741b0e56bfbc74cfa32ff59e15cf420463569b Mon Sep 17 00:00:00 2001
From: Baptiste Coudurier <baptiste.coudurier@gmail.com>
Date: Tue, 16 Aug 2011 17:05:44 +0200
Subject: [PATCH 05/35] h264: 4:2:2 intra decoding support

Signed-off-by: Diego Biurrun <diego@biurrun.de>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
---
 Changelog                            |   1 +
 libavcodec/arm/h264dsp_init_arm.c    |   9 +-
 libavcodec/arm/h264pred_init_arm.c   |   6 +-
 libavcodec/dsputil.h                 |   2 +
 libavcodec/h264.c                    |  97 ++++++++++++++-----
 libavcodec/h264.h                    |  12 +--
 libavcodec/h264_cabac.c              |  84 ++++++++++++++--
 libavcodec/h264_cavlc.c              | 127 ++++++++++++++++++++++--
 libavcodec/h264_loopfilter.c         |  61 +++++++++---
 libavcodec/h264_mvpred.h             |   7 +-
 libavcodec/h264_ps.c                 |   5 +-
 libavcodec/h264data.h                |   9 +-
 libavcodec/h264dsp.c                 |  38 ++++++--
 libavcodec/h264dsp.h                 |   8 +-
 libavcodec/h264dsp_template.c        |  16 ++++
 libavcodec/h264idct_template.c       |  50 ++++++++++
 libavcodec/h264pred.c                |  51 +++++++---
 libavcodec/h264pred.h                |   6 +-
 libavcodec/h264pred_template.c       | 138 +++++++++++++++++++++++++++
 libavcodec/ppc/h264_altivec.c        |   5 +-
 libavcodec/rv34.c                    |   2 +-
 libavcodec/vp8.c                     |   2 +-
 libavcodec/x86/h264_intrapred_init.c |  47 +++++----
 libavcodec/x86/h264dsp_mmx.c         |  23 +++--
 24 files changed, 673 insertions(+), 133 deletions(-)

diff --git a/Changelog b/Changelog
index 3041632a33..491f93b79c 100644
--- a/Changelog
+++ b/Changelog
@@ -54,6 +54,7 @@ easier to use. The changes are:
 - boxblur filter
 - Ut Video decoder
 - Speex encoding via libspeex
+- 4:2:2 H.264 decoding support
 
 
 version 0.7:
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index c2399e50ff..c1ca217add 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -92,7 +92,7 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
                              DCTELEM *block, int stride,
                              const uint8_t nnzc[6*8]);
 
-static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth)
+static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
     if (bit_depth == 8) {
     c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
@@ -122,14 +122,15 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth)
     c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
     c->h264_idct_add16      = ff_h264_idct_add16_neon;
     c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
-    c->h264_idct_add8       = ff_h264_idct_add8_neon;
+    if (chroma_format_idc == 1)
+        c->h264_idct_add8   = ff_h264_idct_add8_neon;
     c->h264_idct8_add       = ff_h264_idct8_add_neon;
     c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
     c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
     }
 }
 
-void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth)
+void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
-    if (HAVE_NEON) ff_h264dsp_init_neon(c, bit_depth);
+    if (HAVE_NEON) ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c
index e96f339a55..5fc07bc137 100644
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@@ -42,7 +42,7 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride);
 void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride);
 void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride);
 
-static void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth)
+static void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc)
 {
     const int high_depth = bit_depth > 8;
 
@@ -74,7 +74,7 @@ static void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int b
         h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
 }
 
-void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, int bit_depth)
+void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, int bit_depth, const int chroma_format_idc)
 {
-    if (HAVE_NEON)    ff_h264_pred_init_neon(h, codec_id, bit_depth);
+    if (HAVE_NEON)    ff_h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index bef2cdd4e8..acb2041460 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -63,8 +63,10 @@ void ff_h264_idct_dc_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int strid
 void ff_h264_idct_add16_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
 void ff_h264_idct_add16intra_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
 void ff_h264_idct8_add4_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
+void ff_h264_idct_add8_422_ ## depth ## _c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
 void ff_h264_idct_add8_ ## depth ## _c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
 void ff_h264_luma_dc_dequant_idct_ ## depth ## _c(DCTELEM *output, DCTELEM *input, int qmul);\
+void ff_h264_chroma422_dc_dequant_idct_ ## depth ## _c(DCTELEM *block, int qmul);\
 void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(DCTELEM *block, int qmul);
 
 H264_IDCT( 8)
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 1faaaa6802..f61f524508 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -942,7 +942,7 @@ static void clone_tables(H264Context *dst, H264Context *src, int i){
     dst->list_counts              = src->list_counts;
 
     dst->s.obmc_scratchpad = NULL;
-    ff_h264_pred_init(&dst->hpc, src->s.codec_id, src->sps.bit_depth_luma);
+    ff_h264_pred_init(&dst->hpc, src->s.codec_id, src->sps.bit_depth_luma, src->sps.chroma_format_idc);
 }
 
 /**
@@ -970,8 +970,8 @@ static av_cold void common_init(H264Context *h){
     s->height = s->avctx->height;
     s->codec_id= s->avctx->codec->id;
 
-    ff_h264dsp_init(&h->h264dsp, 8);
-    ff_h264_pred_init(&h->hpc, s->codec_id, 8);
+    ff_h264dsp_init(&h->h264dsp, 8, 1);
+    ff_h264_pred_init(&h->hpc, s->codec_id, 8, 1);
 
     h->dequant_coeff_pps= -1;
     s->unrestricted_mv=1;
@@ -1432,11 +1432,16 @@ static void decode_postinit(H264Context *h, int setup_finished){
         ff_thread_finish_setup(s->avctx);
 }
 
-static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int chroma444, int simple){
+static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y,
+                                              uint8_t *src_cb, uint8_t *src_cr,
+                                              int linesize, int uvlinesize, int simple)
+{
     MpegEncContext * const s = &h->s;
     uint8_t *top_border;
     int top_idx = 1;
     const int pixel_shift = h->pixel_shift;
+    int chroma444 = CHROMA444;
+    int chroma422 = CHROMA422;
 
     src_y  -=   linesize;
     src_cb -= uvlinesize;
@@ -1460,6 +1465,14 @@ static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, ui
                             AV_COPY128(top_border+16, src_cb + 15*uvlinesize);
                             AV_COPY128(top_border+32, src_cr + 15*uvlinesize);
                         }
+                    } else if(chroma422) {
+                        if (pixel_shift) {
+                            AV_COPY128(top_border+32, src_cb + 15*uvlinesize);
+                            AV_COPY128(top_border+48, src_cr + 15*uvlinesize);
+                        } else {
+                            AV_COPY64(top_border+16, src_cb +  15*uvlinesize);
+                            AV_COPY64(top_border+24, src_cr +  15*uvlinesize);
+                        }
                     } else {
                         if (pixel_shift) {
                             AV_COPY128(top_border+32, src_cb+7*uvlinesize);
@@ -1495,6 +1508,14 @@ static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, ui
                 AV_COPY128(top_border+16, src_cb + 16*linesize);
                 AV_COPY128(top_border+32, src_cr + 16*linesize);
             }
+        } else if(chroma422) {
+            if (pixel_shift) {
+                AV_COPY128(top_border+32, src_cb+16*uvlinesize);
+                AV_COPY128(top_border+48, src_cr+16*uvlinesize);
+            } else {
+                AV_COPY64(top_border+16, src_cb+16*uvlinesize);
+                AV_COPY64(top_border+24, src_cr+16*uvlinesize);
+            }
         } else {
             if (pixel_shift) {
                 AV_COPY128(top_border+32, src_cb+8*uvlinesize);
@@ -1773,10 +1794,11 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     /* is_h264 should always be true if SVQ3 is disabled. */
     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    const int block_h = 16 >> s->chroma_y_shift;
 
     dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
-    dest_cb = s->current_picture.f.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) *  8;
-    dest_cr = s->current_picture.f.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) *  8;
+    dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h;
+    dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h;
 
     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4);
     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2);
@@ -1789,8 +1811,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
         block_offset = &h->block_offset[48];
         if(mb_y&1){ //FIXME move out of this function?
             dest_y -= s->linesize*15;
-            dest_cb-= s->uvlinesize*7;
-            dest_cr-= s->uvlinesize*7;
+            dest_cb-= s->uvlinesize * (block_h - 1);
+            dest_cr-= s->uvlinesize * (block_h - 1);
         }
         if(FRAME_MBAFF) {
             int list;
@@ -1842,12 +1864,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                         }
                     }
                 } else {
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
                         for (j = 0; j < 8; j++)
                             tmp_cb[j] = get_bits(&gb, bit_depth);
                     }
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
                         for (j = 0; j < 8; j++)
                             tmp_cr[j] = get_bits(&gb, bit_depth);
@@ -1865,7 +1887,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                         memset(dest_cr + i*uvlinesize, 128, 8);
                     }
                 } else {
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         memcpy(dest_cb + i*uvlinesize, h->mb + 128 + i*4,  8);
                         memcpy(dest_cr + i*uvlinesize, h->mb + 160 + i*4,  8);
                     }
@@ -1913,10 +1935,18 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                 }
             }else{
                 if(is_h264){
+                    int qp[2];
+                    if (CHROMA422) {
+                        qp[0] = h->chroma_qp[0] + 3;
+                        qp[1] = h->chroma_qp[1] + 3;
+                    } else {
+                        qp[0] = h->chroma_qp[0];
+                        qp[1] = h->chroma_qp[1];
+                    }
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*1 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*1 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][qp[0]][0]);
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*2 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*2 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][qp[1]][0]);
                     h->h264dsp.h264_idct_add8(dest, block_offset,
                                               h->mb, uvlinesize,
                                               h->non_zero_count_cache);
@@ -2555,11 +2585,13 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
 
     h->b_stride=  s->mb_width*4;
 
+    s->chroma_y_shift = h->sps.chroma_format_idc <= 1; // 400 uses yuv420p
+
     s->width = 16*s->mb_width - (2>>CHROMA444)*FFMIN(h->sps.crop_right, (8<<CHROMA444)-1);
     if(h->sps.frame_mbs_only_flag)
-        s->height= 16*s->mb_height - (2>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
+        s->height= 16*s->mb_height - (1<<s->chroma_y_shift)*FFMIN(h->sps.crop_bottom, (16>>s->chroma_y_shift)-1);
     else
-        s->height= 16*s->mb_height - (4>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
+        s->height= 16*s->mb_height - (2<<s->chroma_y_shift)*FFMIN(h->sps.crop_bottom, (16>>s->chroma_y_shift)-1);
 
     if (s->context_initialized
         && (   s->width != s->avctx->width || s->height != s->avctx->height
@@ -2601,14 +2633,26 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
 
         switch (h->sps.bit_depth_luma) {
             case 9 :
-                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P9 : PIX_FMT_YUV420P9;
+                if (CHROMA444)
+                    s->avctx->pix_fmt = PIX_FMT_YUV444P9;
+                else if (CHROMA422)
+                    s->avctx->pix_fmt = PIX_FMT_YUV422P9;
+                else
+                    s->avctx->pix_fmt = PIX_FMT_YUV420P9;
                 break;
             case 10 :
-                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P10 : PIX_FMT_YUV420P10;
+                if (CHROMA444)
+                    s->avctx->pix_fmt = PIX_FMT_YUV444P10;
+                else if (CHROMA422)
+                    s->avctx->pix_fmt = PIX_FMT_YUV422P10;
+                else
+                    s->avctx->pix_fmt = PIX_FMT_YUV420P10;
                 break;
             default:
                 if (CHROMA444){
                     s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ444P : PIX_FMT_YUV444P;
+                } else if (CHROMA422) {
+                    s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P : PIX_FMT_YUV422P;
                 }else{
                     s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
                                                              s->avctx->codec->pix_fmts ?
@@ -3272,6 +3316,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
     const int end_mb_y= s->mb_y + FRAME_MBAFF;
     const int old_slice_type= h->slice_type;
     const int pixel_shift = h->pixel_shift;
+    const int block_h = 16 >> s->chroma_y_shift;
 
     if(h->deblocking_filter) {
         for(mb_x= start_x; mb_x<end_x; mb_x++){
@@ -3288,8 +3333,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                 s->mb_x= mb_x;
                 s->mb_y= mb_y;
                 dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
-                dest_cb = s->current_picture.f.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
-                dest_cr = s->current_picture.f.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
+                dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
+                dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
                     //FIXME simplify above
 
                 if (MB_FIELD) {
@@ -3297,14 +3342,14 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                     uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
                     if(mb_y&1){ //FIXME move out of this function?
                         dest_y -= s->linesize*15;
-                        dest_cb-= s->uvlinesize*((8 << CHROMA444)-1);
-                        dest_cr-= s->uvlinesize*((8 << CHROMA444)-1);
+                        dest_cb-= s->uvlinesize * (block_h - 1);
+                        dest_cr-= s->uvlinesize * (block_h - 1);
                     }
                 } else {
                     linesize   = h->mb_linesize   = s->linesize;
                     uvlinesize = h->mb_uvlinesize = s->uvlinesize;
                 }
-                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, CHROMA444, 0);
+                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
                 if(fill_filter_caches(h, mb_type))
                     continue;
                 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.f.qscale_table[mb_xy]);
@@ -3742,13 +3787,15 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
             if(avctx->has_b_frames < 2)
                 avctx->has_b_frames= !s->low_delay;
 
-            if (avctx->bits_per_raw_sample != h->sps.bit_depth_luma) {
+            if (avctx->bits_per_raw_sample != h->sps.bit_depth_luma ||
+                h->cur_chroma_format_idc != h->sps.chroma_format_idc) {
                 if (h->sps.bit_depth_luma >= 8 && h->sps.bit_depth_luma <= 10) {
                     avctx->bits_per_raw_sample = h->sps.bit_depth_luma;
+                    h->cur_chroma_format_idc = h->sps.chroma_format_idc;
                     h->pixel_shift = h->sps.bit_depth_luma > 8;
 
-                    ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma);
-                    ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma);
+                    ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma, h->sps.chroma_format_idc);
+                    ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma, h->sps.chroma_format_idc);
                     s->dsp.dct_bits = h->sps.bit_depth_luma > 8 ? 32 : 16;
                     dsputil_init(&s->dsp, s->avctx);
                 } else {
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index 122a54aca0..bd2b5d8fe5 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -39,13 +39,6 @@
 #define interlaced_dct interlaced_dct_is_a_bad_name
 #define mb_intra mb_intra_is_not_initialized_see_mb_type
 
-#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
-#define COEFF_TOKEN_VLC_BITS           8
-#define TOTAL_ZEROS_VLC_BITS           9
-#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
-#define RUN_VLC_BITS                   3
-#define RUN7_VLC_BITS                  6
-
 #define MAX_SPS_COUNT 32
 #define MAX_PPS_COUNT 256
 
@@ -92,6 +85,7 @@
 #define CABAC h->pps.cabac
 #endif
 
+#define CHROMA422 (h->sps.chroma_format_idc == 2) /* nonzero when the SPS reachable through an in-scope `h` has chroma_format_idc 2, i.e. 4:2:2 chroma */
 #define CHROMA444 (h->sps.chroma_format_idc == 3)
 
 #define EXTENDED_SAR          255
@@ -582,6 +576,8 @@ typedef struct H264Context{
     // Timestamp stuff
     int sei_buffering_period_present;  ///< Buffering period SEI flag
     int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs
+
+    int cur_chroma_format_idc;
 }H264Context;
 
 
@@ -809,7 +805,7 @@ static av_always_inline void write_back_non_zero_count(H264Context *h){
     AV_COPY32(&nnz[32], &nnz_cache[4+8*11]);
     AV_COPY32(&nnz[36], &nnz_cache[4+8*12]);
 
-    if(CHROMA444){
+    if(!h->s.chroma_y_shift){
         AV_COPY32(&nnz[24], &nnz_cache[4+8* 8]);
         AV_COPY32(&nnz[28], &nnz_cache[4+8* 9]);
         AV_COPY32(&nnz[40], &nnz_cache[4+8*13]);
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 065b6e85e1..0325ea456f 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1565,7 +1565,12 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
 };
 
-static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
+static av_always_inline void
+decode_cabac_residual_internal(H264Context *h, DCTELEM *block,
+                               int cat, int n, const uint8_t *scantable,
+                               const uint32_t *qmul, int max_coeff,
+                               int is_dc, int chroma422)
+{
     static const int significant_coeff_flag_offset[2][14] = {
       { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
       { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
@@ -1587,12 +1592,16 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
     };
+    static const uint8_t sig_coeff_offset_dc[7] = { 0, 0, 1, 1, 2, 2, 2 };
     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
      * map node ctx => cabac ctx for level=1 */
     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
     /* map node ctx => cabac ctx for level>1 */
-    static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+    static const uint8_t coeff_abs_levelgt1_ctx[2][8] = {
+        { 5, 5, 5, 5, 6, 7, 8, 9 },
+        { 5, 5, 5, 5, 6, 7, 8, 8 }, // 422/dc case
+    };
     static const uint8_t coeff_abs_level_transition[2][8] = {
     /* update node ctx after decoding a level=1 */
         { 1, 2, 3, 3, 4, 5, 6, 7 },
@@ -1651,12 +1660,20 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
                                                  last_coeff_ctx_base, sig_off);
     } else {
-        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
-                                             last_coeff_ctx_base-significant_coeff_ctx_base);
+        if (is_dc && chroma422) { // dc 422
+            DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
+        } else {
+            coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
+                                                 last_coeff_ctx_base-significant_coeff_ctx_base);
+        }
 #else
         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
     } else {
-        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
+        if (is_dc && chroma422) { // dc 422
+            DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
+        } else {
+            DECODE_SIGNIFICANCE(max_coeff - 1, last, last);
+        }
 #endif
     }
     assert(coeff_count > 0);
@@ -1691,7 +1708,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
             } \
         } else { \
             int coeff_abs = 2; \
-            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; \
+            ctx = coeff_abs_levelgt1_ctx[is_dc && chroma422][node_ctx] + abs_level_m1_ctx_base; \
             node_ctx = coeff_abs_level_transition[1][node_ctx]; \
 \
             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \
@@ -1733,11 +1750,18 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
 }
 
 static void decode_cabac_residual_dc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
-    decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1);
+    decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 0);
+}
+
+static void decode_cabac_residual_dc_internal_422(H264Context *h, DCTELEM *block,
+                                                  int cat, int n, const uint8_t *scantable,
+                                                  int max_coeff)
+{ /* plain static (not av_always_inline) wrapper for the 4:2:2 DC case of decode_cabac_residual_internal() */
+    decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1); /* qmul = NULL, is_dc = 1, chroma422 = 1 */
 }
 
 static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
-    decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
+    decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0, 0);
 }
 
 /* cat: 0-> DC 16x16  n = 0
@@ -1761,6 +1785,19 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *
     decode_cabac_residual_dc_internal( h, block, cat, n, scantable, max_coeff );
 }
 
+static av_always_inline void
+decode_cabac_residual_dc_422(H264Context *h, DCTELEM *block,
+                             int cat, int n, const uint8_t *scantable,
+                             int max_coeff)
+{
+    /* read the coded block flag; when it decodes as 0 the block is recorded as empty and no residual is parsed */
+    if (get_cabac(&h->cabac, &h->cabac_state[get_cabac_cbf_ctx(h, cat, n, max_coeff, 1)]) == 0) { /* last argument is 1 here and 0 in decode_cabac_residual_nondc(); presumably an is_dc flag - confirm against get_cabac_cbf_ctx() */
+        h->non_zero_count_cache[scan8[n]] = 0; /* non-zero count for block n cleared */
+        return;
+    }
+    decode_cabac_residual_dc_internal_422(h, block, cat, n, scantable, max_coeff); /* flag was set: decode the coefficients via the 4:2:2 DC wrapper */
+}
+
 static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
     /* read coded block flag */
     if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) {
@@ -2313,7 +2350,36 @@ decode_intra_mb:
         if(CHROMA444){
             decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1);
             decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2);
-        } else {
+        } else if (CHROMA422) {
+            if( cbp&0x30 ){
+                int c;
+                for( c = 0; c < 2; c++ ) {
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+                    decode_cabac_residual_dc_422(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3,
+                                                 CHROMA_DC_BLOCK_INDEX + c,
+                                                 chroma422_dc_scan, 8);
+                }
+            }
+
+            if( cbp&0x20 ) {
+                int c, i, i8x8;
+                for( c = 0; c < 2; c++ ) {
+                    DCTELEM *mb = h->mb + (16*(16 + 16*c) << pixel_shift);
+                    qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
+                    for (i8x8 = 0; i8x8 < 2; i8x8++) {
+                        for (i = 0; i < 4; i++) {
+                            const int index = 16 + 16 * c + 8*i8x8 + i;
+                            //av_log(s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16);
+                            decode_cabac_residual_nondc(h, mb, 4, index, scan + 1, qmul, 15);
+                            mb += 16<<pixel_shift;
+                        }
+                    }
+                }
+            } else {
+                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+            }
+        } else /* yuv420 */ {
             if( cbp&0x30 ){
                 int c;
                 for( c = 0; c < 2; c++ ) {
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index ca7b9399d3..b94b51b9a1 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -62,6 +62,30 @@ static const uint8_t chroma_dc_coeff_token_bits[4*5]={
  2, 3, 2, 0,
 };
 
+static const uint8_t chroma422_dc_coeff_token_len[4*9]={ /* code lengths handed to init_vlc() for chroma422_dc_coeff_token_vlc; 9 rows of 4 entries, and decode_residual() takes total_coeff = coeff_token>>2, so each row presumably corresponds to one total_coeff value 0..8 - confirm symbol == index in init_vlc() */
+  1,  0,  0,  0,
+  7,  2,  0,  0,
+  7,  7,  3,  0,
+  9,  7,  7,  5,
+  9,  9,  7,  6,
+ 10, 10,  9,  7,
+ 11, 11, 10,  7,
+ 12, 12, 11, 10,
+ 13, 12, 12, 11, /* 13 is the largest length in the table, equal to CHROMA422_DC_COEFF_TOKEN_VLC_BITS */
+};
+
+static const uint8_t chroma422_dc_coeff_token_bits[4*9]={ /* code values handed to init_vlc() together with chroma422_dc_coeff_token_len; same 4*9 layout, entry for entry */
+  1,   0,  0, 0,
+ 15,   1,  0, 0,
+ 14,  13,  1, 0,
+  7,  12, 11, 1,
+  6,   5, 10, 1,
+  7,   6,  4, 9,
+  7,   6,  5, 8,
+  7,   6,  5, 4,
+  7,   5,  4, 4,
+};
+
 static const uint8_t coeff_token_len[4][4*17]={
 {
      1, 0, 0, 0,
@@ -172,6 +196,26 @@ static const uint8_t chroma_dc_total_zeros_bits[3][4]= {
     { 1, 0, 0, 0,},
 };
 
+static const uint8_t chroma422_dc_total_zeros_len[7][8]= { /* code lengths for chroma422_dc_total_zeros_vlc[i]; decode_residual() indexes (chroma422_dc_total_zeros_vlc-1)[total_coeff], so row i is used when total_coeff == i+1 */
+    { 1, 3, 3, 4, 4, 4, 5, 5 }, /* longest length is 5, equal to CHROMA422_DC_TOTAL_ZEROS_VLC_BITS */
+    { 3, 2, 3, 3, 3, 3, 3 }, /* rows with fewer than 8 initializers are zero-filled by C; presumably init_vlc() treats length 0 as "no code" - confirm */
+    { 3, 3, 2, 2, 3, 3 },
+    { 3, 2, 2, 2, 3 },
+    { 2, 2, 2, 2 },
+    { 2, 2, 1 },
+    { 1, 1 },
+};
+
+static const uint8_t chroma422_dc_total_zeros_bits[7][8]= { /* code values paired entry for entry with chroma422_dc_total_zeros_len; row i feeds init_vlc() for chroma422_dc_total_zeros_vlc[i] */
+    { 1, 2, 3, 2, 3, 1, 1, 0 },
+    { 0, 1, 1, 4, 5, 6, 7 },
+    { 0, 1, 1, 2, 6, 7 },
+    { 6, 0, 1, 2, 7 },
+    { 0, 1, 2, 3 },
+    { 0, 1, 1 },
+    { 0, 1 },
+};
+
 static const uint8_t run_len[7][16]={
     {1,1},
     {1,2,2},
@@ -200,6 +244,10 @@ static VLC chroma_dc_coeff_token_vlc;
 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
 static const int chroma_dc_coeff_token_vlc_table_size = 256;
 
+static VLC chroma422_dc_coeff_token_vlc; /* used by decode_residual() to read coeff_token when max_coeff <= 8 but != 4 */
+static VLC_TYPE chroma422_dc_coeff_token_vlc_table[8192][2]; /* static backing store assigned to .table in ff_h264_decode_init_vlc(); 8192 == 1 << 13 (CHROMA422_DC_COEFF_TOKEN_VLC_BITS) */
+static const int chroma422_dc_coeff_token_vlc_table_size = 8192; /* assigned to .table_allocated; keep equal to the array's first dimension */
+
 static VLC total_zeros_vlc[15];
 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
 static const int total_zeros_vlc_tables_size = 512;
@@ -208,6 +256,10 @@ static VLC chroma_dc_total_zeros_vlc[3];
 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
 
+static VLC chroma422_dc_total_zeros_vlc[7]; /* one VLC per row of chroma422_dc_total_zeros_len/bits; selected by total_coeff - 1 in decode_residual() */
+static VLC_TYPE chroma422_dc_total_zeros_vlc_tables[7][32][2]; /* static backing store, one 32-entry table per VLC; 32 == 1 << 5 (CHROMA422_DC_TOTAL_ZEROS_VLC_BITS) */
+static const int chroma422_dc_total_zeros_vlc_tables_size = 32; /* assigned to each .table_allocated; keep equal to the per-VLC table dimension */
+
 static VLC run_vlc[6];
 static VLC_TYPE run_vlc_tables[6][8][2];
 static const int run_vlc_tables_size = 8;
@@ -219,6 +271,14 @@ static const int run7_vlc_table_size = 96;
 #define LEVEL_TAB_BITS 8
 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 
+#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8 /* bits argument of get_vlc2() on chroma_dc_coeff_token_vlc (max_coeff == 4 path) */
+#define CHROMA422_DC_COEFF_TOKEN_VLC_BITS 13 /* bits argument of init_vlc()/get_vlc2() for chroma422_dc_coeff_token_vlc; its table has 1 << 13 entries */
+#define COEFF_TOKEN_VLC_BITS           8
+#define TOTAL_ZEROS_VLC_BITS           9 /* bits argument of get_vlc2() on total_zeros_vlc */
+#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3 /* bits argument of get_vlc2() on chroma_dc_total_zeros_vlc (max_coeff == 4 path) */
+#define CHROMA422_DC_TOTAL_ZEROS_VLC_BITS 5 /* bits argument of init_vlc()/get_vlc2() for chroma422_dc_total_zeros_vlc; each table has 1 << 5 entries */
+#define RUN_VLC_BITS                   3
+#define RUN7_VLC_BITS                  6
 
 /**
  * gets the predicted number of non-zero coefficients.
@@ -278,6 +338,13 @@ av_cold void ff_h264_decode_init_vlc(void){
                  &chroma_dc_coeff_token_bits[0], 1, 1,
                  INIT_VLC_USE_NEW_STATIC);
 
+        chroma422_dc_coeff_token_vlc.table = chroma422_dc_coeff_token_vlc_table;
+        chroma422_dc_coeff_token_vlc.table_allocated = chroma422_dc_coeff_token_vlc_table_size;
+        init_vlc(&chroma422_dc_coeff_token_vlc, CHROMA422_DC_COEFF_TOKEN_VLC_BITS, 4*9,
+                 &chroma422_dc_coeff_token_len [0], 1, 1,
+                 &chroma422_dc_coeff_token_bits[0], 1, 1,
+                 INIT_VLC_USE_NEW_STATIC);
+
         offset = 0;
         for(i=0; i<4; i++){
             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
@@ -304,6 +371,17 @@ av_cold void ff_h264_decode_init_vlc(void){
                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
                      INIT_VLC_USE_NEW_STATIC);
         }
+
+        for(i=0; i<7; i++){
+            chroma422_dc_total_zeros_vlc[i].table = chroma422_dc_total_zeros_vlc_tables[i];
+            chroma422_dc_total_zeros_vlc[i].table_allocated = chroma422_dc_total_zeros_vlc_tables_size;
+            init_vlc(&chroma422_dc_total_zeros_vlc[i],
+                     CHROMA422_DC_TOTAL_ZEROS_VLC_BITS, 8,
+                     &chroma422_dc_total_zeros_len [i][0], 1, 1,
+                     &chroma422_dc_total_zeros_bits[i][0], 1, 1,
+                     INIT_VLC_USE_NEW_STATIC);
+        }
+
         for(i=0; i<15; i++){
             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
@@ -373,7 +451,10 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
     //FIXME put trailing_onex into the context
 
     if(max_coeff <= 8){
-        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
+        if (max_coeff == 4)
+            coeff_token = get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
+        else
+            coeff_token = get_vlc2(gb, chroma422_dc_coeff_token_vlc.table, CHROMA422_DC_COEFF_TOKEN_VLC_BITS, 1);
         total_coeff= coeff_token>>2;
     }else{
         if(n >= LUMA_DC_BLOCK_INDEX){
@@ -483,11 +564,16 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
     if(total_coeff == max_coeff)
         zeros_left=0;
     else{
-        /* FIXME: we don't actually support 4:2:2 yet. */
-        if(max_coeff <= 8)
-            zeros_left= get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[ total_coeff ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
-        else
+        if (max_coeff <= 8) {
+            if (max_coeff == 4)
+                zeros_left = get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[total_coeff].table,
+                                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
+            else
+                zeros_left = get_vlc2(gb, (chroma422_dc_total_zeros_vlc-1)[total_coeff].table,
+                                      CHROMA422_DC_TOTAL_ZEROS_VLC_BITS, 1);
+        } else {
             zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
+        }
     }
 
 #define STORE_BLOCK(type) \
@@ -994,7 +1080,7 @@ decode_intra_mb:
     s->current_picture.f.mb_type[mb_xy] = mb_type;
 
     if(cbp || IS_INTRA16x16(mb_type)){
-        int i4x4, chroma_idx;
+        int i4x4, i8x8, chroma_idx;
         int dquant;
         int ret;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
@@ -1036,7 +1122,34 @@ decode_intra_mb:
             if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ){
                 return -1;
             }
-        } else {
+        } else if (CHROMA422) {
+            if(cbp&0x30){
+                for(chroma_idx=0; chroma_idx<2; chroma_idx++)
+                    if (decode_residual(h, gb, h->mb + ((256 + 16*16*chroma_idx) << pixel_shift),
+                                        CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma422_dc_scan,
+                                        NULL, 8) < 0) {
+                        return -1;
+                    }
+            }
+
+            if(cbp&0x20){
+                for(chroma_idx=0; chroma_idx<2; chroma_idx++){
+                    const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
+                    DCTELEM *mb = h->mb + (16*(16 + 16*chroma_idx) << pixel_shift);
+                    for (i8x8 = 0; i8x8 < 2; i8x8++) {
+                        for (i4x4 = 0; i4x4 < 4; i4x4++) {
+                            const int index = 16 + 16*chroma_idx + 8*i8x8 + i4x4;
+                            if (decode_residual(h, gb, mb, index, scan + 1, qmul, 15) < 0)
+                                return -1;
+                            mb += 16 << pixel_shift;
+                        }
+                    }
+                }
+            }else{
+                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+            }
+        } else /* yuv420 */ {
             if(cbp&0x30){
                 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                     if( decode_residual(h, gb, h->mb + ((256 + 16*16*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c
index 377968fcd2..64b07e91f0 100644
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@@ -212,6 +212,7 @@ static void av_always_inline h264_filter_mb_fast_internal( H264Context *h, int m
     MpegEncContext * const s = &h->s;
     int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
     int chroma444 = CHROMA444;
+    int chroma422 = CHROMA422;
 
     int mb_xy = h->mb_xy;
     int left_type= h->left_type[LTOP];
@@ -289,6 +290,23 @@ static void av_always_inline h264_filter_mb_fast_internal( H264Context *h, int m
                     filter_mb_edgeh( &img_cb[4*3*linesize], linesize, bS3, qpc, a, b, h, 0);
                     filter_mb_edgeh( &img_cr[4*3*linesize], linesize, bS3, qpc, a, b, h, 0);
                 }
+            }else if(chroma422){
+                if(left_type){
+                    filter_mb_edgecv(&img_cb[2*0<<pixel_shift], uvlinesize, bS4, qpc0, a, b, h, 1);
+                    filter_mb_edgecv(&img_cr[2*0<<pixel_shift], uvlinesize, bS4, qpc0, a, b, h, 1);
+                }
+                filter_mb_edgecv(&img_cb[2*2<<pixel_shift], uvlinesize, bS3, qpc, a, b, h, 0);
+                filter_mb_edgecv(&img_cr[2*2<<pixel_shift], uvlinesize, bS3, qpc, a, b, h, 0);
+                if(top_type){
+                    filter_mb_edgech(&img_cb[4*0*uvlinesize], uvlinesize, bSH, qpc1, a, b, h, 1);
+                    filter_mb_edgech(&img_cr[4*0*uvlinesize], uvlinesize, bSH, qpc1, a, b, h, 1);
+                }
+                filter_mb_edgech(&img_cb[4*1*uvlinesize], uvlinesize, bS3, qpc, a, b, h, 0);
+                filter_mb_edgech(&img_cr[4*1*uvlinesize], uvlinesize, bS3, qpc, a, b, h, 0);
+                filter_mb_edgech(&img_cb[4*2*uvlinesize], uvlinesize, bS3, qpc, a, b, h, 0);
+                filter_mb_edgech(&img_cr[4*2*uvlinesize], uvlinesize, bS3, qpc, a, b, h, 0);
+                filter_mb_edgech(&img_cb[4*3*uvlinesize], uvlinesize, bS3, qpc, a, b, h, 0);
+                filter_mb_edgech(&img_cr[4*3*uvlinesize], uvlinesize, bS3, qpc, a, b, h, 0);
             }else{
                 if(left_type){
                     filter_mb_edgecv( &img_cb[2*0<<pixel_shift], uvlinesize, bS4, qpc0, a, b, h, 1);
@@ -411,10 +429,12 @@ static int check_mv(H264Context *h, long b_idx, long bn_idx, int mvy_limit){
     return v;
 }
 
-static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int a, int b, int chroma, int chroma444, int dir) {
+static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int a, int b, int chroma, int dir) {
     MpegEncContext * const s = &h->s;
     int edge;
     int chroma_qp_avg[2];
+    int chroma444 = CHROMA444;
+    int chroma422 = CHROMA422;
     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
     const int mbm_type = dir == 0 ? h->left_type[LTOP] : h->top_type;
 
@@ -564,8 +584,9 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
     for( edge = 1; edge < edges; edge++ ) {
         DECLARE_ALIGNED(8, int16_t, bS)[4];
         int qp;
+        const int deblock_edge = !IS_8x8DCT(mb_type & (edge<<24)); // (edge&1) && IS_8x8DCT(mb_type)
 
-        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
+        if (!deblock_edge && (!chroma422 || dir == 0))
             continue;
 
         if( IS_INTRA(mb_type)) {
@@ -627,14 +648,23 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
                 }
             }
         } else {
-            filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, a, b, h, 0 );
-            if (chroma) {
-                if (chroma444) {
-                    filter_mb_edgeh ( &img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0);
-                    filter_mb_edgeh ( &img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0);
-                } else if( (edge&1) == 0 ) {
-                    filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0);
-                    filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0);
+            if (chroma422) {
+                if (deblock_edge)
+                    filter_mb_edgeh(&img_y[4*edge*linesize], linesize, bS, qp, a, b, h, 0);
+                if (chroma) {
+                    filter_mb_edgech(&img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0);
+                    filter_mb_edgech(&img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0);
+                }
+            } else {
+                filter_mb_edgeh(&img_y[4*edge*linesize], linesize, bS, qp, a, b, h, 0);
+                if (chroma) {
+                    if (chroma444) {
+                        filter_mb_edgeh (&img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0);
+                        filter_mb_edgeh (&img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0);
+                    } else if ((edge&1) == 0) {
+                        filter_mb_edgech(&img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0);
+                        filter_mb_edgech(&img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0);
+                    }
                 }
             }
         }
@@ -726,6 +756,11 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
                     filter_mb_mbaff_edgev ( h, img_cb + 8*uvlinesize, uvlinesize, bS+4, 1, bqp[1], a, b, 1 );
                     filter_mb_mbaff_edgev ( h, img_cr,                uvlinesize, bS  , 1, rqp[0], a, b, 1 );
                     filter_mb_mbaff_edgev ( h, img_cr + 8*uvlinesize, uvlinesize, bS+4, 1, rqp[1], a, b, 1 );
+                } else if (CHROMA422) {
+                    filter_mb_mbaff_edgecv(h, img_cb,                uvlinesize, bS  , 1, bqp[0], a, b, 1);
+                    filter_mb_mbaff_edgecv(h, img_cb + 8*uvlinesize, uvlinesize, bS+4, 1, bqp[1], a, b, 1);
+                    filter_mb_mbaff_edgecv(h, img_cr,                uvlinesize, bS  , 1, rqp[0], a, b, 1);
+                    filter_mb_mbaff_edgecv(h, img_cr + 8*uvlinesize, uvlinesize, bS+4, 1, rqp[1], a, b, 1);
                 }else{
                     filter_mb_mbaff_edgecv( h, img_cb,                uvlinesize, bS  , 1, bqp[0], a, b, 1 );
                     filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1], a, b, 1 );
@@ -754,9 +789,9 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
 
 #if CONFIG_SMALL
     for( dir = 0; dir < 2; dir++ )
-        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, a, b, chroma, CHROMA444, dir);
+        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, a, b, chroma, dir);
 #else
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, a, b, chroma, CHROMA444, 0);
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0,                        a, b, chroma, CHROMA444, 1);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, a, b, chroma, 0);
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0,                        a, b, chroma, 1);
 #endif
 }
diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h
index 7c7086d440..4b6a083bb4 100644
--- a/libavcodec/h264_mvpred.h
+++ b/libavcodec/h264_mvpred.h
@@ -510,7 +510,7 @@ static void fill_decode_caches(H264Context *h, int mb_type){
     if(top_type){
         nnz = h->non_zero_count[top_xy];
         AV_COPY32(&nnz_cache[4+8* 0], &nnz[4*3]);
-        if(CHROMA444){
+        if(!s->chroma_y_shift){
             AV_COPY32(&nnz_cache[4+8* 5], &nnz[4* 7]);
             AV_COPY32(&nnz_cache[4+8*10], &nnz[4*11]);
         }else{
@@ -534,6 +534,11 @@ static void fill_decode_caches(H264Context *h, int mb_type){
                 nnz_cache[3+8* 7 + 2*8*i]= nnz[left_block[8+1+2*i]+4*4];
                 nnz_cache[3+8*11 + 2*8*i]= nnz[left_block[8+0+2*i]+8*4];
                 nnz_cache[3+8*12 + 2*8*i]= nnz[left_block[8+1+2*i]+8*4];
+            }else if(CHROMA422) {
+                nnz_cache[3+8* 6 + 2*8*i]= nnz[left_block[8+0+2*i]-2+4*4];
+                nnz_cache[3+8* 7 + 2*8*i]= nnz[left_block[8+1+2*i]-2+4*4];
+                nnz_cache[3+8*11 + 2*8*i]= nnz[left_block[8+0+2*i]-2+8*4];
+                nnz_cache[3+8*12 + 2*8*i]= nnz[left_block[8+1+2*i]-2+8*4];
             }else{
                 nnz_cache[3+8* 6 +   8*i]= nnz[left_block[8+4+2*i]];
                 nnz_cache[3+8*11 +   8*i]= nnz[left_block[8+5+2*i]];
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index 677ca80abb..76bf116a3f 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -396,7 +396,8 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
 #endif
     sps->crop= get_bits1(&s->gb);
     if(sps->crop){
-        int crop_limit = sps->chroma_format_idc == 3 ? 16 : 8;
+        int crop_vertical_limit   = sps->chroma_format_idc  & 2 ? 16 : 8;
+        int crop_horizontal_limit = sps->chroma_format_idc == 3 ? 16 : 8;
         sps->crop_left  = get_ue_golomb(&s->gb);
         sps->crop_right = get_ue_golomb(&s->gb);
         sps->crop_top   = get_ue_golomb(&s->gb);
@@ -404,7 +405,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
         if(sps->crop_left || sps->crop_top){
             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
         }
-        if(sps->crop_right >= crop_limit || sps->crop_bottom >= crop_limit){
+        if(sps->crop_right >= crop_horizontal_limit || sps->crop_bottom >= crop_vertical_limit){
             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
         }
     }else{
diff --git a/libavcodec/h264data.h b/libavcodec/h264data.h
index 1851169dd3..2cfa548624 100644
--- a/libavcodec/h264data.h
+++ b/libavcodec/h264data.h
@@ -80,7 +80,14 @@ static const uint8_t luma_dc_field_scan[16]={
 
 static const uint8_t chroma_dc_scan[4]={
  (0+0*2)*16, (1+0*2)*16,
- (0+1*2)*16, (1+1*2)*16,  //FIXME
+ (0+1*2)*16, (1+1*2)*16,
+};
+
+static const uint8_t chroma422_dc_scan[8]={ /* every entry is written as (a + b*2)*16 with a in 0..1 and b in 0..3 */
+ (0+0*2)*16, (0+1*2)*16, /* (a,b) = (0,0), (0,1)  ->   0,  32 */
+ (1+0*2)*16, (0+2*2)*16, /* (a,b) = (1,0), (0,2)  ->  16,  64 */
+ (0+3*2)*16, (1+1*2)*16, /* (a,b) = (0,3), (1,1)  ->  96,  48 */
+ (1+2*2)*16, (1+3*2)*16, /* (a,b) = (1,2), (1,3)  ->  80, 112 */
 };
 
 // zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)]
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index 64f4856189..19ad2db3d9 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -41,7 +41,7 @@
 #include "h264dsp_template.c"
 #undef BIT_DEPTH
 
-void ff_h264dsp_init(H264DSPContext *c, const int bit_depth)
+void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
 #undef FUNC
 #define FUNC(a, depth) a ## _ ## depth ## _c
@@ -53,10 +53,16 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth)
     c->h264_idct8_dc_add= FUNC(ff_h264_idct8_dc_add, depth);\
     c->h264_idct_add16     = FUNC(ff_h264_idct_add16, depth);\
     c->h264_idct8_add4     = FUNC(ff_h264_idct8_add4, depth);\
-    c->h264_idct_add8      = FUNC(ff_h264_idct_add8, depth);\
+    if (chroma_format_idc == 1)\
+        c->h264_idct_add8  = FUNC(ff_h264_idct_add8, depth);\
+    else\
+        c->h264_idct_add8  = FUNC(ff_h264_idct_add8_422, depth);\
     c->h264_idct_add16intra= FUNC(ff_h264_idct_add16intra, depth);\
     c->h264_luma_dc_dequant_idct= FUNC(ff_h264_luma_dc_dequant_idct, depth);\
-    c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma_dc_dequant_idct, depth);\
+    if (chroma_format_idc == 1)\
+        c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma_dc_dequant_idct, depth);\
+    else\
+        c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
 \
     c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
     c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
@@ -86,11 +92,23 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth)
     c->h264_h_loop_filter_luma_intra= FUNC(h264_h_loop_filter_luma_intra, depth);\
     c->h264_h_loop_filter_luma_mbaff_intra= FUNC(h264_h_loop_filter_luma_mbaff_intra, depth);\
     c->h264_v_loop_filter_chroma= FUNC(h264_v_loop_filter_chroma, depth);\
-    c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma, depth);\
-    c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma_mbaff, depth);\
+    if (chroma_format_idc == 1)\
+        c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma, depth);\
+    else\
+        c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma422, depth);\
+    if (chroma_format_idc == 1)\
+        c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma_mbaff, depth);\
+    else\
+        c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma422_mbaff, depth);\
     c->h264_v_loop_filter_chroma_intra= FUNC(h264_v_loop_filter_chroma_intra, depth);\
-    c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma_intra, depth);\
-    c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma_mbaff_intra, depth);\
+    if (chroma_format_idc == 1)\
+        c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma_intra, depth);\
+    else\
+        c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma422_intra, depth);\
+    if (chroma_format_idc == 1)\
+        c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma_mbaff_intra, depth);\
+    else\
+        c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma422_mbaff_intra, depth);\
     c->h264_loop_filter_strength= NULL;
 
     switch (bit_depth) {
@@ -105,7 +123,7 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth)
         break;
     }
 
-    if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth);
-    if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth);
-    if (HAVE_MMX) ff_h264dsp_init_x86(c, bit_depth);
+    if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
+    if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
+    if (HAVE_MMX) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 6972725781..7337f178e9 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -74,9 +74,9 @@ typedef struct H264DSPContext{
     void (*h264_chroma_dc_dequant_idct)(DCTELEM *block, int qmul);
 }H264DSPContext;
 
-void ff_h264dsp_init(H264DSPContext *c, const int bit_depth);
-void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth);
-void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth);
-void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth);
+void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc);
+void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, const int chroma_format_idc);
+void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chroma_format_idc);
+void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc);
 
 #endif /* AVCODEC_H264DSP_H */
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index d11eff0919..ee4bbe51dc 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -275,6 +275,14 @@ static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, int stride, int
 {
     FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 1, alpha, beta, tc0);
 }
+static void FUNCC(h264_h_loop_filter_chroma422)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{ /* Wrapper around the shared h264_loop_filter_chroma kernel; pix, stride, alpha, beta and tc0 are forwarded unchanged. */
+    FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0); /* fourth argument is 4 here; the _mbaff wrapper just above passes 1 */
+}
+static void FUNCC(h264_h_loop_filter_chroma422_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{ /* Same forwarding wrapper as h264_h_loop_filter_chroma422; only the fourth kernel argument differs. */
+    FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0); /* 2 here, versus 4 in the non-mbaff 4:2:2 wrapper */
+}
 
 static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
 {
@@ -312,3 +320,11 @@ static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, int strid
 {
     FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 1, alpha, beta);
 }
+static void FUNCC(h264_h_loop_filter_chroma422_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{ /* Wrapper around h264_loop_filter_chroma_intra: xstride = sizeof(pixel), ystride = stride. */
+    FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta); /* inner_iters = 4; the _mbaff_intra wrapper just above passes 1 */
+}
+static void FUNCC(h264_h_loop_filter_chroma422_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta)
+{ /* Same wrapper as h264_h_loop_filter_chroma422_intra; only inner_iters differs. */
+    FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta); /* inner_iters = 2, versus 4 in the non-mbaff 4:2:2 wrapper */
+}
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index ba5571576d..eba850ac6f 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -224,6 +224,29 @@ void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *
         }
     }
 }
+
+void FUNCC(ff_h264_idct_add8_422)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
+    int plane, n;
+    /* Pass 1: n = 16..19 (dest[0]) and 32..35 (dest[1]); the same n indexes scan8, block_offset and the coefficients. */
+    for(plane=0; plane<2; plane++){
+        for(n=16*plane+16; n<16*plane+20; n++){
+            if(nnzc[ scan8[n] ])
+                FUNCC(ff_h264_idct_add   )(dest[plane] + block_offset[n], block + n*16*sizeof(pixel), stride);
+            else if(((dctcoef*)block)[n*16])
+                FUNCC(ff_h264_idct_dc_add)(dest[plane] + block_offset[n], block + n*16*sizeof(pixel), stride);
+        }
+    }
+    /* Pass 2: n = 24..27 and 40..43 index scan8/block_offset, while the coefficients sit four blocks earlier (n-4). */
+    for(plane=0; plane<2; plane++){
+        for(n=16*plane+24; n<16*plane+28; n++){
+            if(nnzc[ scan8[n] ])
+                FUNCC(ff_h264_idct_add   )(dest[plane] + block_offset[n], block + (n-4)*16*sizeof(pixel), stride);
+            else if(((dctcoef*)block)[(n-4)*16])
+                FUNCC(ff_h264_idct_dc_add)(dest[plane] + block_offset[n], block + (n-4)*16*sizeof(pixel), stride);
+        }
+    }
+}
+
 /**
  * IDCT transforms the 16 dc values and dequantizes them.
  * @param qmul quantization parameter
@@ -263,6 +286,33 @@ void FUNCC(ff_h264_luma_dc_dequant_idct)(DCTELEM *_output, DCTELEM *_input, int
 #undef stride
 }
 
+/* 4:2:2 chroma DC: butterflies over 4 rows x 2 DC terms in place, then scale each result by qmul as (v*qmul + 128) >> 8. */
+void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *_block, int qmul){
+    const int stride= 16*2, xStride= 16; /* element distance between DC rows / between the two DC terms of a row */
+    int i;
+    unsigned temp[8]; /* unsigned so that out-of-range sums wrap instead of being signed-overflow UB */
+    static const uint8_t x_offset[2]={0, 16};
+    dctcoef *block = (dctcoef*)_block;
+    /* Row pass: sum and difference of the two DC terms in each of the four rows. */
+    for(i=0; i<4; i++){
+        temp[2*i+0] = block[stride*i + xStride*0] + (unsigned)block[stride*i + xStride*1];
+        temp[2*i+1] = block[stride*i + xStride*0] - (unsigned)block[stride*i + xStride*1];
+    }
+    /* Column pass: 4-point butterfly over the row sums (i=0) and the row differences (i=1). */
+    for(i=0; i<2; i++){
+        const int offset= x_offset[i];
+        const unsigned z0= temp[2*0+i] + temp[2*2+i];
+        const unsigned z1= temp[2*0+i] - temp[2*2+i];
+        const unsigned z2= temp[2*1+i] - temp[2*3+i];
+        const unsigned z3= temp[2*1+i] + temp[2*3+i];
+        /* The product is formed in unsigned; casting back to int before >> keeps the shift signed as before. */
+        block[stride*0+offset]= (int)((z0 + z3)*qmul + 128) >> 8;
+        block[stride*1+offset]= (int)((z1 + z2)*qmul + 128) >> 8;
+        block[stride*2+offset]= (int)((z1 - z2)*qmul + 128) >> 8;
+        block[stride*3+offset]= (int)((z0 - z3)*qmul + 128) >> 8;
+    }
+}
+
 void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){
     const int stride= 16*2;
     const int xStride= 16;
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index e73d82c547..17199d01e6 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -361,7 +361,7 @@ static void pred8x8_tm_vp8_c(uint8_t *src, int stride){
 /**
  * Set the intra prediction function pointers.
  */
-void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
+void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc){
 //    MpegEncContext * const s = &h->s;
 
 #undef FUNC
@@ -434,20 +434,39 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
     h->pred8x8l[TOP_DC_PRED         ]= FUNCC(pred8x8l_top_dc              , depth);\
     h->pred8x8l[DC_128_PRED         ]= FUNCC(pred8x8l_128_dc              , depth);\
 \
-    h->pred8x8[VERT_PRED8x8   ]= FUNCC(pred8x8_vertical                   , depth);\
-    h->pred8x8[HOR_PRED8x8    ]= FUNCC(pred8x8_horizontal                 , depth);\
+    if (chroma_format_idc == 1) {\
+        h->pred8x8[VERT_PRED8x8   ]= FUNCC(pred8x8_vertical               , depth);\
+        h->pred8x8[HOR_PRED8x8    ]= FUNCC(pred8x8_horizontal             , depth);\
+    } else {\
+        h->pred8x8[VERT_PRED8x8   ]= FUNCC(pred8x16_vertical              , depth);\
+        h->pred8x8[HOR_PRED8x8    ]= FUNCC(pred8x16_horizontal            , depth);\
+    }\
     if (codec_id != CODEC_ID_VP8) {\
-        h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_plane                    , depth);\
+        if (chroma_format_idc == 1) {\
+            h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_plane                , depth);\
+        } else {\
+            h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x16_plane               , depth);\
+        }\
     } else\
         h->pred8x8[PLANE_PRED8x8]= FUNCD(pred8x8_tm_vp8);\
     if(codec_id != CODEC_ID_RV40 && codec_id != CODEC_ID_VP8){\
-        h->pred8x8[DC_PRED8x8     ]= FUNCC(pred8x8_dc                     , depth);\
-        h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc                , depth);\
-        h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x8_top_dc                 , depth);\
-        h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\
-        h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\
-        h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\
-        h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\
+        if (chroma_format_idc == 1) {\
+            h->pred8x8[DC_PRED8x8     ]= FUNCC(pred8x8_dc                     , depth);\
+            h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc                , depth);\
+            h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x8_top_dc                 , depth);\
+            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\
+            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\
+            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\
+            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\
+        } else {\
+            h->pred8x8[DC_PRED8x8     ]= FUNCC(pred8x16_dc                    , depth);\
+            h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc               , depth);\
+            h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc                , depth);\
+            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\
+            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\
+            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\
+            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\
+        }\
     }else{\
         h->pred8x8[DC_PRED8x8     ]= FUNCD(pred8x8_dc_rv40);\
         h->pred8x8[LEFT_DC_PRED8x8]= FUNCD(pred8x8_left_dc_rv40);\
@@ -457,7 +476,11 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
             h->pred8x8[DC_129_PRED8x8]= FUNCC(pred8x8_129_dc              , depth);\
         }\
     }\
-    h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x8_128_dc                     , depth);\
+    if (chroma_format_idc == 1) {\
+        h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x8_128_dc                 , depth);\
+    } else {\
+        h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x16_128_dc                , depth);\
+    }\
 \
     h->pred16x16[DC_PRED8x8     ]= FUNCC(pred16x16_dc                     , depth);\
     h->pred16x16[VERT_PRED8x8   ]= FUNCC(pred16x16_vertical               , depth);\
@@ -504,6 +527,6 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){
             break;
     }
 
-    if (ARCH_ARM) ff_h264_pred_init_arm(h, codec_id, bit_depth);
-    if (HAVE_MMX) ff_h264_pred_init_x86(h, codec_id, bit_depth);
+    if (ARCH_ARM) ff_h264_pred_init_arm(h, codec_id, bit_depth, chroma_format_idc);
+    if (HAVE_MMX) ff_h264_pred_init_x86(h, codec_id, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index 34b1e90bbc..b880446121 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -101,8 +101,8 @@ typedef struct H264PredContext{
     void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
 }H264PredContext;
 
-void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth);
-void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, const int bit_depth);
-void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth);
+void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc);
+void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc);
+void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc);
 
 #endif /* AVCODEC_H264PRED_H */
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index 750e82c12a..d4f654e18c 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -454,6 +454,19 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){
     }
 }
 
+static void FUNCC(pred8x16_vertical)(uint8_t *_src, int _stride){ /* copy the row above into all 16 rows */
+    const int stride = _stride>>(sizeof(pixel)-1);
+    const pixel *const above = (pixel*)_src - stride; /* start of the row just above the block */
+    const pixel4 top_lo = AV_RN4PA(((const pixel4*)above)+0);
+    const pixel4 top_hi = AV_RN4PA(((const pixel4*)above)+1);
+    int y;
+    for(y=0; y<16; y++){
+        pixel *const row = (pixel*)_src + y*stride;
+        AV_WN4PA(((pixel4*)row)+0, top_lo);
+        AV_WN4PA(((pixel4*)row)+1, top_hi);
+    }
+}
+
 static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
     int i;
     pixel *src = (pixel*)_src;
@@ -466,6 +479,17 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
     }
 }
 
+static void FUNCC(pred8x16_horizontal)(uint8_t *_src, int stride){ /* each of the 16 rows is filled with its left neighbour */
+    int y;
+    stride >>= sizeof(pixel)-1;
+    for(y=0; y<16; y++){
+        pixel *const row = (pixel*)_src + y*stride;
+        const pixel4 fill = PIXEL_SPLAT_X4(row[-1]); /* pixel immediately left of this row */
+        AV_WN4PA(((pixel4*)row)+0, fill);
+        AV_WN4PA(((pixel4*)row)+1, fill);
+    }
+}
+
 #define PRED8x8_X(n, v)\
 static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
     int i;\
@@ -482,6 +506,11 @@ PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
 PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
 PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
 
+static void FUNCC(pred8x16_128_dc)(uint8_t *_src, int stride){ /* built from two pred8x8_128_dc calls */
+    FUNCC(pred8x8_128_dc)(_src, stride);          /* first call starts at _src */
+    FUNCC(pred8x8_128_dc)(_src+8*stride, stride); /* second call starts 8*stride bytes further on (_src is uint8_t*) */
+}
+
 static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
     int i;
     int dc0, dc2;
@@ -507,6 +536,11 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
     }
 }
 
+static void FUNCC(pred8x16_left_dc)(uint8_t *_src, int stride){ /* built from two pred8x8_left_dc calls */
+    FUNCC(pred8x8_left_dc)(_src, stride);          /* first call starts at _src */
+    FUNCC(pred8x8_left_dc)(_src+8*stride, stride); /* second call starts 8*stride bytes further on (_src is uint8_t*) */
+}
+
 static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
     int i;
     int dc0, dc1;
@@ -532,6 +566,27 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
     }
 }
 
+static void FUNCC(pred8x16_top_dc)(uint8_t *_src, int stride){
+    pixel *const src = (pixel*)_src;
+    const pixel *top;
+    int n, sum_l = 0, sum_r = 0;
+    pixel4 fill_l, fill_r;
+    /* Sum the row above (src-stride) in two groups of four: columns 0..3 and 4..7. */
+    stride >>= sizeof(pixel)-1;
+    top = src - stride;
+    for(n=0; n<4; n++){
+        sum_l += top[n];
+        sum_r += top[n+4];
+    }
+    fill_l = PIXEL_SPLAT_X4((sum_l + 2)>>2);
+    fill_r = PIXEL_SPLAT_X4((sum_r + 2)>>2);
+    /* All 16 rows get the same pair: left mean in the first pixel4, right mean in the second. */
+    for(n=0; n<16; n++){
+        AV_WN4PA(((pixel4*)(src+n*stride))+0, fill_l);
+        AV_WN4PA(((pixel4*)(src+n*stride))+1, fill_r);
+    }
+}
+
 static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
     int i;
     int dc0, dc1, dc2;
@@ -560,6 +615,48 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
     }
 }
 
+/* DC prediction for an 8-pixel-wide, 16-row block, filled in 4-pixel-wide, 4-row pieces.
+ * Inputs are the 8 pixels of the row above (src-stride) and the 16 pixels of the
+ * column to the left (src[-1]); nothing inside the block is read.
+ *   rows  0..3 : left half  = mean of top pixels 0..3 and left pixels 0..3,
+ *                right half = mean of top pixels 4..7
+ *   rows 4..15 : per 4-row band, left half  = mean of the band's 4 left pixels,
+ *                right half = mean of those 4 left pixels and top pixels 4..7
+ * Each mean adds half the divisor before shifting. */
+static void FUNCC(pred8x16_dc)(uint8_t *_src, int stride){
+    pixel *const src = (pixel*)_src;
+    int top_l = 0, top_r = 0;   /* sums over top pixels 0..3 and 4..7 */
+    int left[4] = {0, 0, 0, 0}; /* sums over the left column, one per 4-row band */
+    pixel4 fill[4][2];          /* [band][0] = left half, [band][1] = right half */
+    int k, band, y;
+
+    stride >>= sizeof(pixel)-1;
+
+    /* Gather all neighbour sums before anything is written. */
+    for(k=0; k<4; k++){
+        top_l += src[k-stride];
+        top_r += src[4+k-stride];
+        for(band=0; band<4; band++)
+            left[band] += src[-1+(k+4*band)*stride];
+    }
+
+    /* Band 0: the left half uses both neighbours, the right half only the top. */
+    fill[0][0] = PIXEL_SPLAT_X4((left[0] + top_l + 4)>>3);
+    fill[0][1] = PIXEL_SPLAT_X4((top_r + 2)>>2);
+    /* Bands 1..3: left half from the left column alone, right half from left column plus top. */
+    for(band=1; band<4; band++){
+        fill[band][0] = PIXEL_SPLAT_X4((left[band] + 2)>>2);
+        fill[band][1] = PIXEL_SPLAT_X4((top_r + left[band] + 4)>>3);
+    }
+
+    /* Store: row y belongs to band y>>2. */
+    for(y=0; y<16; y++){
+        pixel *const row = src + y*stride;
+        AV_WN4PA(((pixel4*)row)+0, fill[y>>2][0]);
+        AV_WN4PA(((pixel4*)row)+1, fill[y>>2][1]);
+    }
+}
+
 //the following 4 function should not be optimized!
 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
     FUNCC(pred8x8_top_dc)(src, stride);
@@ -618,6 +715,47 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
   }
 }
 
+static void FUNCC(pred8x16_plane)(uint8_t *_src, int _stride){ /* plane prediction for an 8-wide, 16-row block */
+  int j, k;
+  int a;
+  INIT_CLIP
+  pixel *src = (pixel*)_src;
+  int stride = _stride>>(sizeof(pixel)-1);
+  const pixel * const src0 = src +3-stride;    /* row above the block, column 3 */
+  const pixel *       src1 = src +8*stride-1;  /* column left of the block, row 8; walks downwards */
+  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
+  int H = src0[1] - src0[-1];
+  int V = src1[0] - src2[ 0];
+  /* H: weighted differences along the row above (k = 1..4); V: along the left column (k = 1..8). */
+  for (k = 2; k <= 4; ++k) {
+      src1 += stride; src2 -= stride;
+      H += k*(src0[k] - src0[-k]);
+      V += k*(src1[0] - src2[ 0]);
+  }
+  for (; k <= 8; ++k) { /* the column is 16 pixels tall, so only V continues up to k = 8 */
+      src1 += stride; src2 -= stride;
+      V += k*(src1[0] - src2[0]);
+  }
+  /* Here src1 == src+15*stride-1 (left column, row 15) and src2 == src-stride-1 (pixel above-left of the block). */
+  H = (17*H+16) >> 5;
+  V = (5*V+32) >> 6;
+  /* src2[8] is column 7 of the row above; a is the value for row 0, column 0, before the final >> 5. */
+  a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
+  for(j=16; j>0; --j) {
+    int b = a;   /* value at column 0 of the current row */
+    a += V;      /* step to the next row */
+    src[0] = CLIP((b    ) >> 5);
+    src[1] = CLIP((b+  H) >> 5);
+    src[2] = CLIP((b+2*H) >> 5);
+    src[3] = CLIP((b+3*H) >> 5);
+    src[4] = CLIP((b+4*H) >> 5);
+    src[5] = CLIP((b+5*H) >> 5);
+    src[6] = CLIP((b+6*H) >> 5);
+    src[7] = CLIP((b+7*H) >> 5);
+    src += stride;
+  }
+}
+
 #define SRC(x,y) src[(x)+(y)*stride]
 #define PL(y) \
     const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index 8dd4ea392e..a9153788de 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -999,12 +999,13 @@ void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
     }
 }
 
-void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth)
+void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
     if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
     if (bit_depth == 8) {
         c->h264_idct_add = ff_h264_idct_add_altivec;
-        c->h264_idct_add8 = ff_h264_idct_add8_altivec;
+        if (chroma_format_idc == 1)
+            c->h264_idct_add8 = ff_h264_idct_add8_altivec;
         c->h264_idct_add16 = ff_h264_idct_add16_altivec;
         c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
         c->h264_idct_dc_add= h264_idct_dc_add_altivec;
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index b771a7f97e..091d49fdb5 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -1343,7 +1343,7 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx)
     if (MPV_common_init(s) < 0)
         return -1;
 
-    ff_h264_pred_init(&r->h, CODEC_ID_RV40, 8);
+    ff_h264_pred_init(&r->h, CODEC_ID_RV40, 8, 1);
 
 #if CONFIG_RV30_DECODER
     if (avctx->codec_id == CODEC_ID_RV30)
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index d5cdaba486..95755e330a 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1769,7 +1769,7 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = PIX_FMT_YUV420P;
 
     dsputil_init(&s->dsp, avctx);
-    ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8);
+    ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
     ff_vp8dsp_init(&s->vp8dsp);
 
     return 0;
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 414d5e6125..41e611ecd1 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -167,7 +167,7 @@ void ff_pred4x4_tm_vp8_mmxext      (uint8_t *src, const uint8_t *topright, int s
 void ff_pred4x4_tm_vp8_ssse3       (uint8_t *src, const uint8_t *topright, int stride);
 void ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride);
 
-void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth)
+void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc)
 {
 #if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
@@ -176,14 +176,17 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
         if (mm_flags & AV_CPU_FLAG_MMX) {
             h->pred16x16[VERT_PRED8x8         ] = ff_pred16x16_vertical_mmx;
             h->pred16x16[HOR_PRED8x8          ] = ff_pred16x16_horizontal_mmx;
-            h->pred8x8  [VERT_PRED8x8         ] = ff_pred8x8_vertical_mmx;
-            h->pred8x8  [HOR_PRED8x8          ] = ff_pred8x8_horizontal_mmx;
+            if (chroma_format_idc == 1) {
+                h->pred8x8  [VERT_PRED8x8     ] = ff_pred8x8_vertical_mmx;
+                h->pred8x8  [HOR_PRED8x8      ] = ff_pred8x8_horizontal_mmx;
+            }
             if (codec_id == CODEC_ID_VP8) {
                 h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_mmx;
                 h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_tm_vp8_mmx;
                 h->pred4x4  [TM_VP8_PRED      ] = ff_pred4x4_tm_vp8_mmx;
             } else {
-                h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_mmx;
+                if (chroma_format_idc == 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_mmx;
                 if (codec_id == CODEC_ID_SVQ3) {
                     h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_mmx;
                 } else if (codec_id == CODEC_ID_RV40) {
@@ -197,7 +200,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             h->pred16x16[HOR_PRED8x8            ] = ff_pred16x16_horizontal_mmxext;
             h->pred16x16[DC_PRED8x8             ] = ff_pred16x16_dc_mmxext;
-            h->pred8x8  [HOR_PRED8x8            ] = ff_pred8x8_horizontal_mmxext;
+            if (chroma_format_idc == 1)
+                h->pred8x8[HOR_PRED8x8          ] = ff_pred8x8_horizontal_mmxext;
             h->pred8x8l [TOP_DC_PRED            ] = ff_pred8x8l_top_dc_mmxext;
             h->pred8x8l [DC_PRED                ] = ff_pred8x8l_dc_mmxext;
             h->pred8x8l [HOR_PRED               ] = ff_pred8x8l_horizontal_mmxext;
@@ -221,8 +225,10 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
                 h->pred4x4  [HOR_UP_PRED        ] = ff_pred4x4_horizontal_up_mmxext;
             }
             if (codec_id == CODEC_ID_SVQ3 || codec_id == CODEC_ID_H264) {
-                h->pred8x8  [TOP_DC_PRED8x8     ] = ff_pred8x8_top_dc_mmxext;
-                h->pred8x8  [DC_PRED8x8         ] = ff_pred8x8_dc_mmxext;
+                if (chroma_format_idc == 1) {
+                    h->pred8x8[TOP_DC_PRED8x8   ] = ff_pred8x8_top_dc_mmxext;
+                    h->pred8x8[DC_PRED8x8       ] = ff_pred8x8_dc_mmxext;
+                }
             }
             if (codec_id == CODEC_ID_VP8) {
                 h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_tm_vp8_mmxext;
@@ -231,7 +237,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
                 h->pred4x4  [TM_VP8_PRED        ] = ff_pred4x4_tm_vp8_mmxext;
                 h->pred4x4  [VERT_PRED          ] = ff_pred4x4_vertical_vp8_mmxext;
             } else {
-                h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_mmx2;
+                if (chroma_format_idc == 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_mmx2;
                 if (codec_id == CODEC_ID_SVQ3) {
                     h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_svq3_mmx2;
                 } else if (codec_id == CODEC_ID_RV40) {
@@ -257,7 +264,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
                 h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_sse2;
                 h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_tm_vp8_sse2;
             } else {
-                h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_plane_sse2;
+                if (chroma_format_idc == 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_sse2;
                 if (codec_id == CODEC_ID_SVQ3) {
                     h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_sse2;
                 } else if (codec_id == CODEC_ID_RV40) {
@@ -271,7 +279,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
         if (mm_flags & AV_CPU_FLAG_SSSE3) {
             h->pred16x16[HOR_PRED8x8          ] = ff_pred16x16_horizontal_ssse3;
             h->pred16x16[DC_PRED8x8           ] = ff_pred16x16_dc_ssse3;
-            h->pred8x8  [HOR_PRED8x8          ] = ff_pred8x8_horizontal_ssse3;
+            if (chroma_format_idc == 1)
+                h->pred8x8  [HOR_PRED8x8      ] = ff_pred8x8_horizontal_ssse3;
             h->pred8x8l [TOP_DC_PRED          ] = ff_pred8x8l_top_dc_ssse3;
             h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_ssse3;
             h->pred8x8l [HOR_PRED             ] = ff_pred8x8l_horizontal_ssse3;
@@ -286,7 +295,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
                 h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_tm_vp8_ssse3;
                 h->pred4x4  [TM_VP8_PRED      ] = ff_pred4x4_tm_vp8_ssse3;
             } else {
-                h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_ssse3;
+                if (chroma_format_idc == 1)
+                    h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_ssse3;
                 if (codec_id == CODEC_ID_SVQ3) {
                     h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_ssse3;
                 } else if (codec_id == CODEC_ID_RV40) {
@@ -301,7 +311,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
             h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
             h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;
 
-            h->pred8x8[DC_PRED8x8          ] = ff_pred8x8_dc_10_mmxext;
+            if (chroma_format_idc == 1)
+                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_mmxext;
 
             h->pred8x8l[DC_128_PRED        ] = ff_pred8x8l_128_dc_10_mmxext;
 
@@ -319,11 +330,13 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
             h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_sse2;
             h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_sse2;
 
-            h->pred8x8[DC_PRED8x8          ] = ff_pred8x8_dc_10_sse2;
-            h->pred8x8[TOP_DC_PRED8x8      ] = ff_pred8x8_top_dc_10_sse2;
-            h->pred8x8[PLANE_PRED8x8       ] = ff_pred8x8_plane_10_sse2;
-            h->pred8x8[VERT_PRED8x8        ] = ff_pred8x8_vertical_10_sse2;
-            h->pred8x8[HOR_PRED8x8         ] = ff_pred8x8_horizontal_10_sse2;
+            if (chroma_format_idc == 1) {
+                h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_sse2;
+                h->pred8x8[TOP_DC_PRED8x8  ] = ff_pred8x8_top_dc_10_sse2;
+                h->pred8x8[PLANE_PRED8x8   ] = ff_pred8x8_plane_10_sse2;
+                h->pred8x8[VERT_PRED8x8    ] = ff_pred8x8_vertical_10_sse2;
+                h->pred8x8[HOR_PRED8x8     ] = ff_pred8x8_horizontal_10_sse2;
+            }
 
             h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_sse2;
             h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_sse2;
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 35ec267b42..910ad8401f 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -350,7 +350,7 @@ H264_BIWEIGHT_10_SSE( 4,  8, 10)
 H264_BIWEIGHT_10_SSE( 4,  4, 10)
 H264_BIWEIGHT_10_SSE( 4,  2, 10)
 
-void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
+void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
     int mm_flags = av_get_cpu_flags();
 
@@ -368,7 +368,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
 
         c->h264_idct_add16          = ff_h264_idct_add16_8_mmx;
         c->h264_idct8_add4          = ff_h264_idct8_add4_8_mmx;
-        c->h264_idct_add8           = ff_h264_idct_add8_8_mmx;
+        if (chroma_format_idc == 1)
+            c->h264_idct_add8       = ff_h264_idct_add8_8_mmx;
         c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_mmx;
         c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
 
@@ -377,13 +378,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
             c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_8_mmx2;
             c->h264_idct_add16     = ff_h264_idct_add16_8_mmx2;
             c->h264_idct8_add4     = ff_h264_idct8_add4_8_mmx2;
-            c->h264_idct_add8      = ff_h264_idct_add8_8_mmx2;
+            if (chroma_format_idc == 1)
+                c->h264_idct_add8  = ff_h264_idct_add8_8_mmx2;
             c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2;
 
             c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext;
-            c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
             c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext;
-            c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext;
+            if (chroma_format_idc == 1) {
+                c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
+                c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext;
+            }
 #if ARCH_X86_32
             c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext;
             c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext;
@@ -413,7 +417,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
 
                 c->h264_idct_add16          = ff_h264_idct_add16_8_sse2;
                 c->h264_idct8_add4          = ff_h264_idct8_add4_8_sse2;
-                c->h264_idct_add8           = ff_h264_idct_add8_8_sse2;
+                if (chroma_format_idc == 1)
+                    c->h264_idct_add8       = ff_h264_idct_add8_8_sse2;
                 c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                 c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
 
@@ -472,7 +477,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                 c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_sse2;
 
                 c->h264_idct_add16     = ff_h264_idct_add16_10_sse2;
-                c->h264_idct_add8      = ff_h264_idct_add8_10_sse2;
+                if (chroma_format_idc == 1)
+                    c->h264_idct_add8  = ff_h264_idct_add8_10_sse2;
                 c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;
 #if HAVE_ALIGNED_STACK
                 c->h264_idct8_add      = ff_h264_idct8_add_10_sse2;
@@ -532,7 +538,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                 c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_avx;
 
                 c->h264_idct_add16     = ff_h264_idct_add16_10_avx;
-                c->h264_idct_add8      = ff_h264_idct_add8_10_avx;
+                if (chroma_format_idc == 1)
+                    c->h264_idct_add8  = ff_h264_idct_add8_10_avx;
                 c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;
 #if HAVE_ALIGNED_STACK
                 c->h264_idct8_add      = ff_h264_idct8_add_10_avx;

From 229d263cc914b5396847f7249fdda2e6ded9ec1b Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 12 Oct 2011 08:55:37 -0700
Subject: [PATCH 06/35] Support for lossless and inter H264 4:2:2.

---
 libavcodec/h264.c              | 66 ++++++++++++++++++++++++++++------
 libavcodec/h264pred.c          | 13 ++++---
 libavcodec/h264pred_template.c | 39 +++++++++++++++++++-
 libavcodec/x86/h264dsp_mmx.c   |  2 +-
 4 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index f61f524508..8d652f13ce 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -457,6 +457,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
     const int full_my= my>>2;
     const int pic_width  = 16*s->mb_width;
     const int pic_height = 16*s->mb_height >> MB_FIELD;
+    int ysh;
 
     if(mx&7) extra_width -= 3;
     if(my&7) extra_height -= 3;
@@ -465,7 +466,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
        || full_my < 0-extra_height
        || full_mx + 16/*FIXME*/ > pic_width + extra_width
        || full_my + 16/*FIXME*/ > pic_height + extra_height){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
+                                16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
             src_y= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
         emu=1;
     }
@@ -502,25 +504,27 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         return;
     }
 
-    if(MB_FIELD){
+    ysh = 3 - !!(CHROMA422);
+    if(!CHROMA422 && MB_FIELD){
         // chroma offset when predicting from a field of opposite parity
         my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1));
         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
     }
-    src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize;
-    src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize;
+
+    src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
+    src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
 
     if(emu){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
             src_cb= s->edge_emu_buffer;
     }
-    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 
     if(emu){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
             src_cr= s->edge_emu_buffer;
     }
-    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 }
 
 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
@@ -537,6 +541,9 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
     if(chroma444){
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+    } else if (CHROMA422) {
+        dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+        dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
     }else{
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
@@ -577,6 +584,9 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         chroma_weight_op = luma_weight_op;
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
+    } else if (CHROMA422) {
+        dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+        dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
     }else{
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
@@ -606,6 +616,14 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
+            if (CHROMA422) {
+                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
+                                  tmp_cb + chroma_height * h->mb_uvlinesize,
+                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
+                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
+                                  tmp_cr + chroma_height * h->mb_uvlinesize,
+                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
+            }
         }else{
             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                             h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
@@ -616,6 +634,18 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                             h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
+            if (CHROMA422) {
+                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
+                                  tmp_cb + chroma_height * h->mb_uvlinesize,
+                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
+                                  h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
+                                  h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
+                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
+                                  tmp_cr + chroma_height * h->mb_uvlinesize,
+                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
+                                  h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
+                                  h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
+            }
         }
     }else{
         int list = list1 ? 1 : 0;
@@ -632,6 +662,14 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                              h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                              h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
+            if (CHROMA422) {
+                chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
+                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
+                                 h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
+                chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
+                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
+                                 h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
+            }
         }
     }
 }
@@ -1851,13 +1889,13 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             }
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
                         for (j = 0; j < 8; j++) {
                             tmp_cb[j] = 1 << (bit_depth - 1);
                         }
                     }
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
                         for (j = 0; j < 8; j++) {
                             tmp_cr[j] = 1 << (bit_depth - 1);
@@ -1882,7 +1920,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             }
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         memset(dest_cb + i*uvlinesize, 128, 8);
                         memset(dest_cr + i*uvlinesize, 128, 8);
                     }
@@ -1931,6 +1969,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                             if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
                                 idct_add   (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
                         }
+                        if (CHROMA422) {
+                            for(i=j*16+4; i<j*16+8; i++){
+                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                                    idct_add   (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize);
+                            }
+                        }
                     }
                 }
             }else{
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 17199d01e6..37a4cf1486 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -462,10 +462,10 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
             h->pred8x8[DC_PRED8x8     ]= FUNCC(pred8x16_dc                    , depth);\
             h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc               , depth);\
             h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc                , depth);\
-            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\
-            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\
-            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\
-            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\
+            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l0t, depth);\
+            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0lt, depth);\
+            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l00, depth);\
+            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0l0, depth);\
         }\
     }else{\
         h->pred8x8[DC_PRED8x8     ]= FUNCD(pred8x8_dc_rv40);\
@@ -510,8 +510,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
     h->pred4x4_add  [ HOR_PRED   ]= FUNCC(pred4x4_horizontal_add          , depth);\
     h->pred8x8l_add [VERT_PRED   ]= FUNCC(pred8x8l_vertical_add           , depth);\
     h->pred8x8l_add [ HOR_PRED   ]= FUNCC(pred8x8l_horizontal_add         , depth);\
+    if (chroma_format_idc == 1) {\
     h->pred8x8_add  [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add            , depth);\
     h->pred8x8_add  [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add          , depth);\
+    } else {\
+        h->pred8x8_add  [VERT_PRED8x8]= FUNCC(pred8x16_vertical_add            , depth);\
+        h->pred8x8_add  [ HOR_PRED8x8]= FUNCC(pred8x16_horizontal_add          , depth);\
+    }\
     h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add          , depth);\
     h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add        , depth);\
 
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index d4f654e18c..318b56196d 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -657,29 +657,50 @@ static void FUNCC(pred8x16_dc)(uint8_t *_src, int stride){
     }
 }
 
-//the following 4 function should not be optimized!
 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
     FUNCC(pred8x8_top_dc)(src, stride);
     FUNCC(pred4x4_dc)(src, NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){
+    FUNCC(pred8x16_top_dc)(src, stride);
+    FUNCC(pred4x4_dc)(src, NULL, stride);
+}
+
 static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
     FUNCC(pred8x8_dc)(src, stride);
     FUNCC(pred4x4_top_dc)(src, NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){
+    FUNCC(pred8x16_dc)(src, stride);
+    FUNCC(pred4x4_top_dc)(src, NULL, stride);
+}
+
 static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
     FUNCC(pred8x8_left_dc)(src, stride);
     FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
     FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){
+    FUNCC(pred8x16_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
+}
+
 static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
     FUNCC(pred8x8_left_dc)(src, stride);
     FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
     FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){
+    FUNCC(pred8x16_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
+}
+
 static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
   int j, k;
   int a;
@@ -1126,8 +1147,24 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, c
         FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
 
+static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<4; i++)
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+    for(i=4; i<8; i++)
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
+}
+
 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
     int i;
     for(i=0; i<4; i++)
         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
+
+static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<4; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+    for(i=4; i<8; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
+}
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 910ad8401f..06ee7cad43 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -354,7 +354,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 {
     int mm_flags = av_get_cpu_flags();
 
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
         c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
     }
 

From c2d337429c7c87ee559efe54dbc0f84f2a25c3a4 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 21 Oct 2011 00:00:39 -0700
Subject: [PATCH 07/35] H264: change weight/biweight functions to take a height
 argument.

Neon parts by Mans Rullgard <mans@mansr.com>.
---
 libavcodec/arm/h264dsp_init_arm.c    |  77 +++-------
 libavcodec/arm/h264dsp_neon.S        |  86 ++++-------
 libavcodec/h264.c                    | 126 +++++++---------
 libavcodec/h264dsp.c                 |  28 +---
 libavcodec/h264dsp.h                 |  10 +-
 libavcodec/h264dsp_template.c        |  28 ++--
 libavcodec/ppc/h264_altivec.c        |  44 +++---
 libavcodec/x86/h264_weight.asm       | 210 ++++++++++-----------------
 libavcodec/x86/h264_weight_10bit.asm | 145 +++++++-----------
 libavcodec/x86/h264dsp_mmx.c         | 175 ++++++++--------------
 10 files changed, 337 insertions(+), 592 deletions(-)

diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index c1ca217add..1c331a495d 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0);
 
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
-                                      int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+                                   int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
 
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                        int log2_den, int weightd, int weights,
-                                        int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                     int height, int log2_den, int weightd,
+                                     int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
 
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
     c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
     c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
 
-    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
-    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
-    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
-    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
-    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
-    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
-    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
-    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
 
-    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
-    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
-    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
-    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
-    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
-    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
-    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
-    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
 
     c->h264_idct_add        = ff_h264_idct_add_neon;
     c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 0fa4a6b0a5..3d2c6746ae 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1592,7 +1592,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q2,  q8
         vmov            q3,  q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
         vld1.8          {d20-d21},[r0,:128], r2
         \macd           q2,  d0,  d20
         pld             [r0]
@@ -1632,7 +1632,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
         vld1.8          {d4},[r0,:64], r2
         \macd           q1,  d0,  d4
         pld             [r0]
@@ -1662,7 +1662,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r3,  r3,  #4
         vld1.32         {d4[0]},[r0,:32], r2
         vld1.32         {d4[1]},[r0,:32], r2
         \macd           q1,  d0,  d4
@@ -1700,16 +1700,17 @@ endfunc
         .endm
 
         .macro  biweight_func w
-function biweight_h264_pixels_\w\()_neon
+function ff_biweight_h264_pixels_\w\()_neon, export=1
         push            {r4-r6, lr}
-        add             r4,  sp,  #16
+        ldr             r12, [sp, #16]
+        add             r4,  sp,  #20
         ldm             r4,  {r4-r6}
         lsr             lr,  r4,  #31
         add             r6,  r6,  #1
         eors            lr,  lr,  r5,  lsr #30
         orr             r6,  r6,  #1
-        vdup.16         q9,  r3
-        lsl             r6,  r6,  r3
+        vdup.16         q9,  r12
+        lsl             r6,  r6,  r12
         vmvn            q9,  q9
         vdup.16         q8,  r6
         mov             r6,  r0
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
 endfunc
         .endm
 
-        .macro  biweight_entry w, h, b=1
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               biweight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        biweight_entry  16, 8
-        biweight_entry  16, 16, b=0
         biweight_func   16
-
-        biweight_entry  8,  16
-        biweight_entry  8,  4
-        biweight_entry  8,  8,  b=0
         biweight_func   8
-
-        biweight_entry  4,  8
-        biweight_entry  4,  2
-        biweight_entry  4,  4,  b=0
         biweight_func   4
 
 @ Weighted prediction
 
         .macro  weight_16 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
+        vdup.8          d0,  r12
+1:      subs            r2,  r2,  #2
         vld1.8          {d20-d21},[r0,:128], r1
         vmull.u8        q2,  d0,  d20
         pld             [r0]
@@ -1785,8 +1767,8 @@ endfunc
         .endm
 
         .macro  weight_8 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
+        vdup.8          d0,  r12
+1:      subs            r2,  r2,  #2
         vld1.8          {d4},[r0,:64], r1
         vmull.u8        q1,  d0,  d4
         pld             [r0]
@@ -1806,10 +1788,10 @@ endfunc
         .endm
 
         .macro  weight_4 add
-        vdup.8          d0,  r3
+        vdup.8          d0,  r12
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r2,  r2,  #4
         vld1.32         {d4[0]},[r0,:32], r1
         vld1.32         {d4[1]},[r0,:32], r1
         vmull.u8        q1,  d0,  d4
@@ -1842,50 +1824,32 @@ endfunc
         .endm
 
         .macro  weight_func w
-function weight_h264_pixels_\w\()_neon
+function ff_weight_h264_pixels_\w\()_neon, export=1
         push            {r4, lr}
-        ldr             r4,  [sp, #8]
-        cmp             r2,  #1
-        lsl             r4,  r4,  r2
+        ldr             r12, [sp, #8]
+        ldr             r4,  [sp, #12]
+        cmp             r3,  #1
+        lsl             r4,  r4,  r3
         vdup.16         q8,  r4
         mov             r4,  r0
         ble             20f
-        rsb             lr,  r2,  #1
+        rsb             lr,  r3,  #1
         vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
         blt             10f
         weight_\w       vhadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
         weight_\w       vhsub.s16
-20:     rsb             lr,  r2,  #0
+20:     rsb             lr,  r3,  #0
         vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
         blt             10f
         weight_\w       vadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
         weight_\w       vsub.s16
 endfunc
         .endm
 
-        .macro  weight_entry w, h, b=1
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               weight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        weight_entry    16, 8
-        weight_entry    16, 16, b=0
         weight_func     16
-
-        weight_entry    8,  16
-        weight_entry    8,  4
-        weight_entry    8,  8,  b=0
         weight_func     8
-
-        weight_entry    4,  8
-        weight_entry    4,  2
-        weight_entry    4,  4,  b=0
         weight_func     4
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 8d652f13ce..7306828197 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
 }
 #endif
 
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
+static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
+                               int height, int delta, int list,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int src_x_offset, int src_y_offset,
                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
@@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
             src_cb= s->edge_emu_buffer;
     }
-    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 
     if(emu){
         s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
             src_cr= s->edge_emu_buffer;
     }
-    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
 }
 
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
 
     if(list0){
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
+        mc_dir_part(h, ref, n, square, height, delta, 0,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
                            qpix_op, chroma_op, pixel_shift, chroma444);
 
@@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
 
     if(list1){
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
+        mc_dir_part(h, ref, n, square, height, delta, 1,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
                            qpix_op, chroma_op, pixel_shift, chroma444);
     }
 }
 
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                            int list0, int list1, int pixel_shift, int chroma444){
     MpegEncContext * const s = &h->s;
+    int chroma_height;
 
     dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
     if(chroma444){
+        chroma_height = height;
         chroma_weight_avg = luma_weight_avg;
         chroma_weight_op = luma_weight_op;
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
     } else if (CHROMA422) {
+        chroma_height = height;
         dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
     }else{
+        chroma_height = height >> 1;
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
     }
@@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         int refn0 = h->ref_cache[0][ scan8[n] ];
         int refn1 = h->ref_cache[1][ scan8[n] ];
 
-        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
+        mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
                     dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
-        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
+        mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
                     tmp_y, tmp_cb, tmp_cr,
                     x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
 
         if(h->use_weight == 2){
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
             int weight1 = 64 - weight0;
-            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
-            if (CHROMA422) {
-                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
-                                  tmp_cb + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
-                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
-                                  tmp_cr + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
-            }
+            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize,
+                              height,        5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
+                              chroma_height, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
+                              chroma_height, 5, weight0, weight1, 0);
         }else{
-            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
+            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                             h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
                             h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                             h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                             h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
-            if (CHROMA422) {
-                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
-                                  tmp_cb + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                  h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
-                                  h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
-                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
-                                  tmp_cr + chroma_height * h->mb_uvlinesize,
-                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                  h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
-                                  h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
-            }
         }
     }else{
         int list = list1 ? 1 : 0;
         int refn = h->ref_cache[list][ scan8[n] ];
         Picture *ref= &h->ref_list[list][refn];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
+        mc_dir_part(h, ref, n, square, height, delta, list,
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put, chroma_put, pixel_shift, chroma444);
 
-        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
+        luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
         if(h->use_weight_chroma){
-            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                              h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
-            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                              h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
-            if (CHROMA422) {
-                chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
-                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                 h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
-                chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
-                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
-                                 h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
-            }
         }
     }
 }
 
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
     if((h->use_weight==2 && list0 && list1
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
        || h->use_weight==1)
-        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                          x_offset, y_offset, qpix_put, chroma_put,
-                         weight_op[0], weight_op[3], weight_avg[0],
-                         weight_avg[3], list0, list1, pixel_shift, chroma444);
+                         weight_op[0], weight_op[1], weight_avg[0],
+                         weight_avg[1], list0, list1, pixel_shift, chroma444);
     else
-        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
                     chroma_avg, list0, list1, pixel_shift, chroma444);
 }
@@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
     prefetch_motion(h, 0, pixel_shift, chroma444);
 
     if(IS_16X16(mb_type)){
-        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                 pixel_shift, chroma444);
     }else if(IS_16X8(mb_type)){
-        mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                 pixel_shift, chroma444);
-        mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
+        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                 pixel_shift, chroma444);
     }else if(IS_8X16(mb_type)){
-        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                 pixel_shift, chroma444);
-        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
+        mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                 pixel_shift, chroma444);
     }else{
@@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
             int y_offset= (i&2)<<1;
 
             if(IS_SUB_8X8(sub_mb_type)){
-                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                    &weight_op[3], &weight_avg[3],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
             }else if(IS_SUB_8X4(sub_mb_type)){
-                mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
-                mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+                mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
             }else if(IS_SUB_4X8(sub_mb_type)){
-                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
-                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+                mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                     pixel_shift, chroma444);
             }else{
@@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                 for(j=0; j<4; j++){
                     int sub_x_offset= x_offset + 2*(j&1);
                     int sub_y_offset= y_offset +   (j&2);
-                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+                    mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[6], &weight_avg[6],
+                        &weight_op[2], &weight_avg[2],
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                         pixel_shift, chroma444);
                 }
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index 19ad2db3d9..ba967079fb 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
     else\
         c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
 \
-    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
-    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
-    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
-    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
-    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
-    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
-    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
-    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
-    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
-    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
-    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
-    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
-    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
-    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
-    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
-    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
-    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
-    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
-    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
-    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
+    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
+    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
+    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
+    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
+    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
+    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
+    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
+    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
 \
     c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
     c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 7337f178e9..7cae215a95 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -31,16 +31,18 @@
 #include "dsputil.h"
 
 //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
+                                 int log2_denom, int weight, int offset);
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
+                                   int log2_denom, int weightd, int weights, int offset);
 
 /**
  * Context for storing H.264 DSP functions
  */
 typedef struct H264DSPContext{
     /* weighted MC */
-    h264_weight_func weight_h264_pixels_tab[10];
-    h264_biweight_func biweight_h264_pixels_tab[10];
+    h264_weight_func weight_h264_pixels_tab[4];
+    h264_biweight_func biweight_h264_pixels_tab[4];
 
     /* loop filter */
     void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index ee4bbe51dc..3d99cfcfec 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -29,14 +29,16 @@
 
 #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
 #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
-static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
+#define H264_WEIGHT(W) \
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
+                                           int log2_denom, int weight, int offset) \
+{ \
     int y; \
     pixel *block = (pixel*)_block; \
     stride /= sizeof(pixel); \
     offset <<= (log2_denom + (BIT_DEPTH-8)); \
     if(log2_denom) offset += 1<<(log2_denom-1); \
-    for(y=0; y<H; y++, block += stride){ \
+    for (y = 0; y < height; y++, block += stride) { \
         op_scale1(0); \
         op_scale1(1); \
         if(W==2) continue; \
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
         op_scale1(15); \
     } \
 } \
-static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
+                                             int log2_denom, int weightd, int weights, int offset) \
+{ \
     int y; \
     pixel *dst = (pixel*)_dst; \
     pixel *src = (pixel*)_src; \
     stride /= sizeof(pixel); \
     offset <<= (BIT_DEPTH-8); \
     offset = ((offset + 1) | 1) << log2_denom; \
-    for(y=0; y<H; y++, dst += stride, src += stride){ \
+    for (y = 0; y < height; y++, dst += stride, src += stride) { \
         op_scale2(0); \
         op_scale2(1); \
         if(W==2) continue; \
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
     } \
 }
 
-H264_WEIGHT(16,16)
-H264_WEIGHT(16,8)
-H264_WEIGHT(8,16)
-H264_WEIGHT(8,8)
-H264_WEIGHT(8,4)
-H264_WEIGHT(4,8)
-H264_WEIGHT(4,4)
-H264_WEIGHT(4,2)
-H264_WEIGHT(2,4)
-H264_WEIGHT(2,2)
+H264_WEIGHT(16)
+H264_WEIGHT(8)
+H264_WEIGHT(4)
+H264_WEIGHT(2)
 
 #undef op_scale1
 #undef op_scale2
diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index a9153788de..edc043c3c7 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
 }
 
 static av_always_inline
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
+void weight_h264_W_altivec(uint8_t *block, int stride, int height,
+                           int log2_denom, int weight, int offset, int w)
 {
     int y, aligned;
     vec_u8 vblock;
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
     voffset = vec_splat(vtemp, 5);
     aligned = !((unsigned long)block & 0xf);
 
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
         v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
 }
 
 static av_always_inline
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-                               int weightd, int weights, int offset, int w, int h)
+void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
+                             int log2_denom, int weightd, int weights, int offset, int w)
 {
     int y, dst_aligned, src_aligned;
     vec_u8 vsrc, vdst;
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
     dst_aligned = !((unsigned long)dst & 0xf);
     src_aligned = !((unsigned long)src & 0xf);
 
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
     }
 }
 
-#define H264_WEIGHT(W,H) \
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
-    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
+#define H264_WEIGHT(W) \
+static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
+                                                   int log2_denom, int weight, int offset){ \
+    weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
 }\
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
-    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
+                                                     int log2_denom, int weightd, int weights, int offset){ \
+    biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
 }
 
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
+H264_WEIGHT(16)
+H264_WEIGHT( 8)
 
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
         c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
         c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
 
-        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
-        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
-        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
-        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
-        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
-        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
-        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
-        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
-        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
-        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
     }
     }
 }
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index d80ca32583..bc8bfd686e 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -28,21 +28,20 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 ; biweight pred:
 ;
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
-;                               int log2_denom, int weightd, int weights,
-;                               int offset);
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
+;                            int height, int log2_denom, int weightd,
+;                            int weights, int offset);
 ; and
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
-;                             int log2_denom, int weight,
-;                             int offset);
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
+;                          int log2_denom, int weight, int offset);
 ;-----------------------------------------------------------------------------
 
 %macro WEIGHT_SETUP 0
-    add        r4, r4
-    inc        r4
-    movd       m3, r3d
-    movd       m5, r4d
-    movd       m6, r2d
+    add        r5, r5
+    inc        r5
+    movd       m3, r4d
+    movd       m5, r5d
+    movd       m6, r3d
     pslld      m5, m6
     psrld      m5, 1
 %if mmsize == 16
@@ -71,60 +70,41 @@ SECTION .text
     packuswb      m0, m1
 %endmacro
 
-%macro WEIGHT_FUNC_DBL_MM 1
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0
+INIT_MMX
+cglobal h264_weight_16_mmx2, 6, 6, 0
     WEIGHT_SETUP
-    mov        r2, %1
-%if %1 == 16
 .nextrow
     WEIGHT_OP 0,  4
     mova     [r0  ], m0
     WEIGHT_OP 8, 12
     mova     [r0+8], m0
     add        r0, r1
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
-%endif
-%endmacro
 
-INIT_MMX
-WEIGHT_FUNC_DBL_MM 16
-WEIGHT_FUNC_DBL_MM  8
-
-%macro WEIGHT_FUNC_MM 4
-cglobal h264_weight_%1x%2_%4, 7, 7, %3
+%macro WEIGHT_FUNC_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-    mov        r2, %2
-%if %2 == 16
 .nextrow
     WEIGHT_OP 0, mmsize/2
     mova     [r0], m0
     add        r0, r1
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-WEIGHT_FUNC_MM  8, 16,  0, mmx2
-WEIGHT_FUNC_MM  8,  8,  0, mmx2
-WEIGHT_FUNC_MM  8,  4,  0, mmx2
+WEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
-WEIGHT_FUNC_MM 16, 16,  8, sse2
-WEIGHT_FUNC_MM 16,  8,  8, sse2
+WEIGHT_FUNC_MM 16, 8, sse2
 
-%macro WEIGHT_FUNC_HALF_MM 5
-cglobal h264_weight_%1x%2_%5, 5, 5, %4
+%macro WEIGHT_FUNC_HALF_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-    mov        r2, %2/2
+    sar       r2d, 1
     lea        r3, [r1*2]
-%if %2 == mmsize
 .nextrow
     WEIGHT_OP 0, r1
     movh     [r0], m0
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
     movh     [r0+r1], m0
 %endif
     add        r0, r3
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
 
 %macro BIWEIGHT_SETUP 0
-    add        r6, 1
-    or         r6, 1
-    add        r3, 1
-    movd       m3, r4d
-    movd       m4, r5d
-    movd       m5, r6d
-    movd       m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+    mov  off_regd, r7m
+    add  off_regd, 1
+    or   off_regd, 1
+    add        r4, 1
+    movd       m3, r5d
+    movd       m4, r6d
+    movd       m5, off_regd
+    movd       m6, r4d
     pslld      m5, m6
     psrld      m5, 1
 %if mmsize == 16
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
     packuswb   m0, m1
 %endmacro
 
-%macro BIWEIGHT_FUNC_DBL_MM 1
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
+INIT_MMX
+cglobal h264_biweight_16_mmx2, 7, 7, 0
     BIWEIGHT_SETUP
-    mov        r3, %1
-%if %1 == 16
+    movifnidn r3d, r3m
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, 4
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
     mova     [r0+8], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
-%endif
-%endmacro
 
-INIT_MMX
-BIWEIGHT_FUNC_DBL_MM 16
-BIWEIGHT_FUNC_DBL_MM  8
-
-%macro BIWEIGHT_FUNC_MM 4
-cglobal h264_biweight_%1x%2_%4, 7, 7, %3
+%macro BIWEIGHT_FUNC_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
     BIWEIGHT_SETUP
-    mov        r3, %2
-%if %2 == 16
+    movifnidn r3d, r3m
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, mmsize/2
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
     mova       [r0], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
-BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
-BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
+BIWEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_MM 16, 16,  8, sse2
-BIWEIGHT_FUNC_MM 16,  8,  8, sse2
+BIWEIGHT_FUNC_MM 16, 8, sse2
 
-%macro BIWEIGHT_FUNC_HALF_MM 5
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4
+%macro BIWEIGHT_FUNC_HALF_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
     BIWEIGHT_SETUP
-    mov        r3, %2/2
+    movifnidn r3d, r3m
+    sar       r3d, 1
     lea        r4, [r2*2]
-%if %2 == mmsize
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, r2
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
 %endif
     add        r0, r4
     add        r1, r4
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
 
 %macro BIWEIGHT_SSSE3_SETUP 0
-    add        r6, 1
-    or         r6, 1
-    add        r3, 1
-    movd       m4, r4d
-    movd       m0, r5d
-    movd       m5, r6d
-    movd       m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+    mov  off_regd, r7m
+    add  off_regd, 1
+    or   off_regd, 1
+    add        r4, 1
+    movd       m4, r5d
+    movd       m0, r6d
+    movd       m5, off_regd
+    movd       m6, r4d
     pslld      m5, m6
     psrld      m5, 1
     punpcklbw  m4, m0
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
     packuswb   m0, m2
 %endmacro
 
-%macro BIWEIGHT_SSSE3_16 1
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
+INIT_XMM
+cglobal h264_biweight_16_ssse3, 7, 7, 8
     BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1
+    movifnidn r3d, r3m
 
-%if %1 == 16
 .nextrow
     movh       m0, [r0]
     movh       m2, [r0+8]
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
     mova       [r0], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
-%endif
-%endmacro
 
 INIT_XMM
-BIWEIGHT_SSSE3_16 16
-BIWEIGHT_SSSE3_16  8
-
-%macro BIWEIGHT_SSSE3_8 1
-cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
+cglobal h264_biweight_8_ssse3, 7, 7, 8
     BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1/2
+    movifnidn r3d, r3m
+    sar       r3d, 1
     lea        r4, [r2*2]
 
-%if %1 == 16
 .nextrow
     movh       m0, [r0]
     movh       m1, [r1]
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
     movhps     [r0+r2], m0
     add        r0, r4
     add        r1, r4
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
-%endif
-%endmacro
-
-INIT_XMM
-BIWEIGHT_SSSE3_8 16
-BIWEIGHT_SSSE3_8  8
-BIWEIGHT_SSSE3_8  4
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 1c58d72d94..20df6fbab5 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -36,33 +36,27 @@ cextern pw_1
 SECTION .text
 
 ;-----------------------------------------------------------------------------
-; void h264_weight(uint8_t *dst, int stride, int log2_denom,
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
 ;                  int weight, int offset);
 ;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_32
-DECLARE_REG_TMP 2
-%else
-DECLARE_REG_TMP 10
-%endif
-
-%macro WEIGHT_PROLOGUE 1
-    mov t0, %1
+%macro WEIGHT_PROLOGUE 0
 .prologue
-    PROLOGUE 0,5,8
+    PROLOGUE 0,6,8
     movifnidn  r0, r0mp
     movifnidn r1d, r1m
-    movifnidn r3d, r3m
+    movifnidn r2d, r2m
     movifnidn r4d, r4m
+    movifnidn r5d, r5m
 %endmacro
 
 %macro WEIGHT_SETUP 1
     mova       m0, [pw_1]
-    movd       m2, r2m
+    movd       m2, r3m
     pslld      m0, m2       ; 1<<log2_denom
     SPLATW     m0, m0
-    shl        r4, 19       ; *8, move to upper half of dword
-    lea        r4, [r4+r3*2+0x10000]
-    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
+    shl        r5, 19       ; *8, move to upper half of dword
+    lea        r5, [r5+r4*2+0x10000]
+    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
     pshufd     m3, m3, 0
     mova       m4, [pw_pixel_max]
     paddw      m2, [sq_1]   ; log2_denom+1
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
 %endmacro
 
 %macro WEIGHT_FUNC_DBL 1
-cglobal h264_weight_16x16_10_%1
-    WEIGHT_PROLOGUE 16
+cglobal h264_weight_16_10_%1
+    WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
 .nextrow
     WEIGHT_OP %1,  0
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
     WEIGHT_OP %1, 16
     mova [r0+16], m5
     add       r0, r1
-    dec       t0
+    dec       r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_16x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
 
 
 %macro WEIGHT_FUNC_MM 1
-cglobal h264_weight_8x16_10_%1
-    WEIGHT_PROLOGUE 16
+cglobal h264_weight_8_10_%1
+    WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
 .nextrow
     WEIGHT_OP  %1, 0
     mova     [r0], m5
     add        r0, r1
-    dec        t0
+    dec        r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_8x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
-
-cglobal h264_weight_8x4_10_%1
-    mov t0, 4
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
 
 
 %macro WEIGHT_FUNC_HALF_MM 1
-cglobal h264_weight_4x8_10_%1
-    WEIGHT_PROLOGUE 4
+cglobal h264_weight_4_10_%1
+    WEIGHT_PROLOGUE
+    sar         r2d, 1
     WEIGHT_SETUP %1
     lea         r3, [r1*2]
 .nextrow
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
     movh      [r0], m5
     movhps [r0+r1], m5
     add         r0, r3
-    dec         t0
+    dec         r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_4x4_10_%1
-    mov t0, 2
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
-
-cglobal h264_weight_4x2_10_%1
-    mov t0, 1
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
 
 
 ;-----------------------------------------------------------------------------
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-;                    int weightd, int weights, int offset);
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
+;                    int log2_denom, int weightd, int weights, int offset);
 ;-----------------------------------------------------------------------------
 %ifdef ARCH_X86_32
-DECLARE_REG_TMP 2,3
+DECLARE_REG_TMP 3
 %else
-DECLARE_REG_TMP 10,2
+DECLARE_REG_TMP 10
 %endif
 
-%macro BIWEIGHT_PROLOGUE 1
-    mov t0, %1
+%macro BIWEIGHT_PROLOGUE 0
 .prologue
     PROLOGUE 0,7,8
     movifnidn  r0, r0mp
     movifnidn  r1, r1mp
-    movifnidn t1d, r2m
-    movifnidn r4d, r4m
+    movifnidn r2d, r2m
     movifnidn r5d, r5m
     movifnidn r6d, r6m
+    movifnidn t0d, r7m
 %endmacro
 
 %macro BIWEIGHT_SETUP 1
-    lea        r6, [r6*4+1] ; (offset<<2)+1
-    or         r6, 1
-    shl        r5, 16
-    or         r4, r5
-    movd       m4, r4d      ; weightd | weights
-    movd       m5, r6d      ; (offset+1)|1
-    movd       m6, r3m      ; log2_denom
+    lea        t0, [t0*4+1] ; (offset<<2)+1
+    or         t0, 1
+    shl        r6, 16
+    or         r5, r6
+    movd       m4, r5d      ; weightd | weights
+    movd       m5, t0d      ; (offset+1)|1
+    movd       m6, r4m      ; log2_denom
     pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
     paddd      m6, [sq_1]
     pshufd     m4, m4, 0
     pshufd     m5, m5, 0
     mova       m3, [pw_pixel_max]
+    movifnidn r3d, r3m
 %ifnidn %1, sse4
     pxor       m7, m7
 %endif
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
 %endmacro
 
 %macro BIWEIGHT_FUNC_DBL 1
-cglobal h264_biweight_16x16_10_%1
-    BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_16_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
 .nextrow
     BIWEIGHT  %1,  0
     mova [r0   ], m0
     BIWEIGHT  %1, 16
     mova [r0+16], m0
-    add       r0, t1
-    add       r1, t1
-    dec       t0
+    add       r0, r2
+    add       r1, r2
+    dec       r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_16x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
 BIWEIGHT_FUNC_DBL sse4
 
 %macro BIWEIGHT_FUNC 1
-cglobal h264_biweight_8x16_10_%1
-    BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_8_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
 .nextrow
     BIWEIGHT %1, 0
     mova   [r0], m0
-    add      r0, t1
-    add      r1, t1
-    dec      t0
+    add      r0, r2
+    add      r1, r2
+    dec      r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_8x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
-
-cglobal h264_biweight_8x4_10_%1
-    mov t0, 4
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
 BIWEIGHT_FUNC sse4
 
 %macro BIWEIGHT_FUNC_HALF 1
-cglobal h264_biweight_4x8_10_%1
-    BIWEIGHT_PROLOGUE 4
+cglobal h264_biweight_4_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
-    lea        r4, [t1*2]
+    sar        r3d, 1
+    lea        r4, [r2*2]
 .nextrow
-    BIWEIGHT    %1, 0, t1
+    BIWEIGHT    %1, 0, r2
     movh   [r0   ], m0
-    movhps [r0+t1], m0
+    movhps [r0+r2], m0
     add         r0, r4
     add         r1, r4
-    dec         t0
+    dec         r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_4x4_10_%1
-    mov t0, 2
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
-
-cglobal h264_biweight_4x2_10_%1
-    mov t0, 1
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
 %endmacro
 
 INIT_XMM
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 06ee7cad43..dcd918013c 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -298,57 +298,47 @@ LF_IFUNC(v,  luma_intra,      10, mmxext)
 /***********************************/
 /* weighted prediction */
 
-#define H264_WEIGHT(W, H, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
-    int stride, int log2_denom, int weight, int offset);
+#define H264_WEIGHT(W, OPT) \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
+    int stride, int height, int log2_denom, int weight, int offset);
 
-#define H264_BIWEIGHT(W, H, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
-    uint8_t *src, int stride, int log2_denom, int weightd, \
+#define H264_BIWEIGHT(W, OPT) \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
+    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
     int weights, int offset);
 
-#define H264_BIWEIGHT_MMX(W,H) \
-H264_WEIGHT  (W, H, mmx2) \
-H264_BIWEIGHT(W, H, mmx2)
-
-#define H264_BIWEIGHT_MMX_SSE(W,H) \
-H264_BIWEIGHT_MMX(W, H) \
-H264_WEIGHT      (W, H, sse2) \
-H264_BIWEIGHT    (W, H, sse2) \
-H264_BIWEIGHT    (W, H, ssse3)
-
-H264_BIWEIGHT_MMX_SSE(16, 16)
-H264_BIWEIGHT_MMX_SSE(16,  8)
-H264_BIWEIGHT_MMX_SSE( 8, 16)
-H264_BIWEIGHT_MMX_SSE( 8,  8)
-H264_BIWEIGHT_MMX_SSE( 8,  4)
-H264_BIWEIGHT_MMX    ( 4,  8)
-H264_BIWEIGHT_MMX    ( 4,  4)
-H264_BIWEIGHT_MMX    ( 4,  2)
-
-#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
-    int stride, int log2_denom, int weight, int offset);
-
-#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
+#define H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT  (W, mmx2) \
+H264_BIWEIGHT(W, mmx2)
+
+#define H264_BIWEIGHT_MMX_SSE(W) \
+H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT      (W, sse2) \
+H264_BIWEIGHT    (W, sse2) \
+H264_BIWEIGHT    (W, ssse3)
+
+H264_BIWEIGHT_MMX_SSE(16)
+H264_BIWEIGHT_MMX_SSE( 8)
+H264_BIWEIGHT_MMX    ( 4)
+
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+    int stride, int height, int log2_denom, int weight, int offset);
+
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
      int weightd, int weights, int offset);
 
-#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
-H264_WEIGHT_10  (W, H, DEPTH, sse2) \
-H264_WEIGHT_10  (W, H, DEPTH, sse4) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse4)
-
-H264_BIWEIGHT_10_SSE(16, 16, 10)
-H264_BIWEIGHT_10_SSE(16,  8, 10)
-H264_BIWEIGHT_10_SSE( 8, 16, 10)
-H264_BIWEIGHT_10_SSE( 8,  8, 10)
-H264_BIWEIGHT_10_SSE( 8,  4, 10)
-H264_BIWEIGHT_10_SSE( 4,  8, 10)
-H264_BIWEIGHT_10_SSE( 4,  4, 10)
-H264_BIWEIGHT_10_SSE( 4,  2, 10)
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
+H264_WEIGHT_10  (W, DEPTH, sse2) \
+H264_WEIGHT_10  (W, DEPTH, sse4) \
+H264_BIWEIGHT_10(W, DEPTH, sse2) \
+H264_BIWEIGHT_10(W, DEPTH, sse4)
+
+H264_BIWEIGHT_10_SSE(16, 10)
+H264_BIWEIGHT_10_SSE( 8, 10)
+H264_BIWEIGHT_10_SSE( 4, 10)
 
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
-            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
-            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
-            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
-            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
-            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
-            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
-            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
-            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
-            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
-            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
+            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
+            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
+
+            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
+            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
+            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
 
             if (mm_flags&AV_CPU_FLAG_SSE2) {
                 c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                 c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                 c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
 
-                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
-                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
-                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
-                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
-                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
+                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
+                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
 
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
 
 #if HAVE_ALIGNED_STACK
                 c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
             }
             if (mm_flags&AV_CPU_FLAG_SSSE3) {
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
             }
             if (mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                 c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
 #endif
 
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
-
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
 
                 c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                 c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
             }
             if (mm_flags&AV_CPU_FLAG_SSE4) {
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
-
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
             }
 #if HAVE_AVX
             if (mm_flags&AV_CPU_FLAG_AVX) {

From 05fb63f5a070154aa7e681fa8617a5322322559a Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 21 Oct 2011 00:01:16 -0700
Subject: [PATCH 08/35] H264: have hl_motion() and its callees take a
 chroma_idc argument.

---
 libavcodec/h264.c | 150 ++++++++++++++++++++++++++++++----------------
 1 file changed, 98 insertions(+), 52 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 7306828197..0525df3712 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -440,10 +440,11 @@ static void chroma_dc_dct_c(DCTELEM *block){
 
 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                                int height, int delta, int list,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int src_x_offset, int src_y_offset,
-                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
-                           int pixel_shift, int chroma444){
+                               uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+                               int src_x_offset, int src_y_offset,
+                               qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
+                               int pixel_shift, int chroma_idc)
+{
     MpegEncContext * const s = &h->s;
     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
@@ -480,7 +481,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
 
     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
 
-    if(chroma444){
+    if(chroma_idc == 3 /* yuv444 */){
         src_cb = pic->f.data[1] + offset;
         if(emu){
             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
@@ -505,8 +506,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         return;
     }
 
-    ysh = 3 - !!(CHROMA422);
-    if(!CHROMA422 && MB_FIELD){
+    ysh = 3 - (chroma_idc == 2 /* yuv422 */);
+    if(chroma_idc == 1 /* yuv420 */ && MB_FIELD){
         // chroma offset when predicting from a field of opposite parity
         my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1));
         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
@@ -516,16 +517,22 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
     src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
 
     if(emu){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize,
+                                9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+                                pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
             src_cb= s->edge_emu_buffer;
     }
-    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
+              mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
 
     if(emu){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize,
+                                9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+                                pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
             src_cr= s->edge_emu_buffer;
     }
-    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
+              mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
 }
 
 static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
@@ -533,19 +540,20 @@ static inline void mc_part_std(H264Context *h, int n, int square, int height, in
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+                           int list0, int list1, int pixel_shift, int chroma_idc)
+{
     MpegEncContext * const s = &h->s;
     qpel_mc_func *qpix_op=  qpix_put;
     h264_chroma_mc_func chroma_op= chroma_put;
 
     dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    if(chroma444){
+    if (chroma_idc == 3 /* yuv444 */) {
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    } else if (CHROMA422) {
+    } else if (chroma_idc == 2 /* yuv422 */) {
         dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
-    }else{
+    } else /* yuv420 */ {
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
     }
@@ -556,7 +564,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int height, in
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
         mc_dir_part(h, ref, n, square, height, delta, 0,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift, chroma444);
+                           qpix_op, chroma_op, pixel_shift, chroma_idc);
 
         qpix_op=  qpix_avg;
         chroma_op= chroma_avg;
@@ -566,7 +574,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int height, in
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
         mc_dir_part(h, ref, n, square, height, delta, 1,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift, chroma444);
+                           qpix_op, chroma_op, pixel_shift, chroma_idc);
     }
 }
 
@@ -576,22 +584,22 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+                           int list0, int list1, int pixel_shift, int chroma_idc){
     MpegEncContext * const s = &h->s;
     int chroma_height;
 
     dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    if(chroma444){
+    if (chroma_idc == 3 /* yuv444 */) {
         chroma_height = height;
         chroma_weight_avg = luma_weight_avg;
         chroma_weight_op = luma_weight_op;
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    } else if (CHROMA422) {
+    } else if (chroma_idc == 2 /* yuv422 */) {
         chroma_height = height;
         dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
-    }else{
+    } else /* yuv420 */ {
         chroma_height = height >> 1;
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
@@ -610,10 +618,12 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh
 
         mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
                     dest_y, dest_cb, dest_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
+                    x_offset, y_offset, qpix_put, chroma_put,
+                    pixel_shift, chroma_idc);
         mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
                     tmp_y, tmp_cb, tmp_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
+                    x_offset, y_offset, qpix_put, chroma_put,
+                    pixel_shift, chroma_idc);
 
         if(h->use_weight == 2){
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
@@ -641,7 +651,7 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh
         Picture *ref= &h->ref_list[list][refn];
         mc_dir_part(h, ref, n, square, height, delta, list,
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                    qpix_put, chroma_put, pixel_shift, chroma444);
+                    qpix_put, chroma_put, pixel_shift, chroma_idc);
 
         luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
@@ -660,21 +670,22 @@ static inline void mc_part(H264Context *h, int n, int square, int height, int de
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+                           int list0, int list1, int pixel_shift, int chroma_idc)
+{
     if((h->use_weight==2 && list0 && list1
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
        || h->use_weight==1)
         mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                          x_offset, y_offset, qpix_put, chroma_put,
                          weight_op[0], weight_op[1], weight_avg[0],
-                         weight_avg[1], list0, list1, pixel_shift, chroma444);
+                         weight_avg[1], list0, list1, pixel_shift, chroma_idc);
     else
         mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
-                    chroma_avg, list0, list1, pixel_shift, chroma444);
+                    chroma_avg, list0, list1, pixel_shift, chroma_idc);
 }
 
-static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){
+static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc){
     /* fetch pixels for estimated mv 4 macroblocks ahead
      * optimized for 64byte cache lines */
     MpegEncContext * const s = &h->s;
@@ -685,7 +696,7 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, in
         uint8_t **src = h->ref_list[list][refn].f.data;
         int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift);
         s->dsp.prefetch(src[0]+off, s->linesize, 4);
-        if(chroma444){
+        if (chroma_idc == 3 /* yuv444 */) {
             s->dsp.prefetch(src[1]+off, s->linesize, 4);
             s->dsp.prefetch(src[2]+off, s->linesize, 4);
         }else{
@@ -699,7 +710,8 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                       h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                      int pixel_shift, int chroma444){
+                      int pixel_shift, int chroma_idc)
+{
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     const int mb_type = s->current_picture.f.mb_type[mb_xy];
@@ -708,36 +720,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
 
     if(HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
         await_references(h);
-    prefetch_motion(h, 0, pixel_shift, chroma444);
+    prefetch_motion(h, 0, pixel_shift, chroma_idc);
 
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
     }else if(IS_16X8(mb_type)){
         mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
         mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
     }else if(IS_8X16(mb_type)){
         mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
         mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
     }else{
         int i;
 
@@ -754,29 +766,29 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                     &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
             }else if(IS_SUB_8X4(sub_mb_type)){
                 mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
                 mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
             }else if(IS_SUB_4X8(sub_mb_type)){
                 mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
                 mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
             }else{
                 int j;
                 assert(IS_SUB_4X4(sub_mb_type));
@@ -787,13 +799,35 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                         &weight_op[2], &weight_avg[2],
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma444);
+                        pixel_shift, chroma_idc);
                 }
             }
         }
     }
 
-    prefetch_motion(h, 1, pixel_shift, chroma444);
+    prefetch_motion(h, 1, pixel_shift, chroma_idc);
+}
+
+static av_always_inline void
+hl_motion_420(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+              qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+              qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+              h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+              int pixel_shift)
+{
+    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
+              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1);
+}
+
+static av_always_inline void
+hl_motion_422(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+              qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+              qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+              h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+              int pixel_shift)
+{
+    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
+              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2);
 }
 
 static void free_tables(H264Context *h, int free_rbsp){
@@ -1798,7 +1832,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
     }
 }
 
-static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
+static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift)
+{
     MpegEncContext * const s = &h->s;
     const int mb_x= s->mb_x;
     const int mb_y= s->mb_y;
@@ -1813,6 +1848,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
     const int block_h = 16 >> s->chroma_y_shift;
+    const int chroma422 = CHROMA422;
 
     dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
     dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h;
@@ -1927,11 +1963,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             if(h->deblocking_filter)
                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
         }else if(is_h264){
-            hl_motion(h, dest_y, dest_cb, dest_cr,
-                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                      h->h264dsp.weight_h264_pixels_tab,
-                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 0);
+            if (chroma422) {
+                hl_motion_422(h, dest_y, dest_cb, dest_cr,
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                              h->h264dsp.weight_h264_pixels_tab,
+                              h->h264dsp.biweight_h264_pixels_tab,
+                              pixel_shift);
+            } else {
+                hl_motion_420(h, dest_y, dest_cb, dest_cr,
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                              h->h264dsp.weight_h264_pixels_tab,
+                              h->h264dsp.biweight_h264_pixels_tab,
+                              pixel_shift);
+            }
         }
 
         hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
@@ -1949,7 +1995,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                             if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
                                 idct_add   (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
                         }
-                        if (CHROMA422) {
+                        if (chroma422) {
                             for(i=j*16+4; i<j*16+8; i++){
                                 if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
                                     idct_add   (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize);
@@ -1960,7 +2006,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             }else{
                 if(is_h264){
                     int qp[2];
-                    if (CHROMA422) {
+                    if (chroma422) {
                         qp[0] = h->chroma_qp[0] + 3;
                         qp[1] = h->chroma_qp[1] + 3;
                     } else {
@@ -2079,7 +2125,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                       h->h264dsp.weight_h264_pixels_tab,
-                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 1);
+                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
         }
 
         for (p = 0; p < plane_count; p++)

From 27209bb108c8a3d2c0de2c36dfb973667df24017 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 20 Oct 2011 23:36:23 -0700
Subject: [PATCH 09/35] h264: mark some MC functions with av_always_inline
 instead of inline.

This actually causes them to be inlined, leading to a significant
speedup (1-1.5% in my measurements).
---
 libavcodec/h264.c | 60 ++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 0525df3712..17124c3088 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -438,12 +438,13 @@ static void chroma_dc_dct_c(DCTELEM *block){
 }
 #endif
 
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
-                               int height, int delta, int list,
-                               uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                               int src_x_offset, int src_y_offset,
-                               qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
-                               int pixel_shift, int chroma_idc)
+static av_always_inline void
+mc_dir_part(H264Context *h, Picture *pic, int n, int square,
+            int height, int delta, int list,
+            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+            int src_x_offset, int src_y_offset,
+            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
+            int pixel_shift, int chroma_idc)
 {
     MpegEncContext * const s = &h->s;
     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
@@ -535,12 +536,13 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
               mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
 }
 
-static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int x_offset, int y_offset,
-                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-                           int list0, int list1, int pixel_shift, int chroma_idc)
+static av_always_inline void
+mc_part_std(H264Context *h, int n, int square, int height, int delta,
+            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+            int x_offset, int y_offset,
+            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+            int list0, int list1, int pixel_shift, int chroma_idc)
 {
     MpegEncContext * const s = &h->s;
     qpel_mc_func *qpix_op=  qpix_put;
@@ -578,13 +580,14 @@ static inline void mc_part_std(H264Context *h, int n, int square, int height, in
     }
 }
 
-static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int x_offset, int y_offset,
-                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
-                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma_idc){
+static av_always_inline void
+mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
+                 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+                 int x_offset, int y_offset,
+                 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+                 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
+                 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
+                 int list0, int list1, int pixel_shift, int chroma_idc){
     MpegEncContext * const s = &h->s;
     int chroma_height;
 
@@ -664,13 +667,14 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh
     }
 }
 
-static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int x_offset, int y_offset,
-                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-                           h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma_idc)
+static av_always_inline void
+mc_part(H264Context *h, int n, int square, int height, int delta,
+        uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+        int x_offset, int y_offset,
+        qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+        qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+        h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+        int list0, int list1, int pixel_shift, int chroma_idc)
 {
     if((h->use_weight==2 && list0 && list1
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
@@ -685,7 +689,9 @@ static inline void mc_part(H264Context *h, int n, int square, int height, int de
                     chroma_avg, list0, list1, pixel_shift, chroma_idc);
 }
 
-static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc){
+static av_always_inline void
+prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc)
+{
     /* fetch pixels for estimated mv 4 macroblocks ahead
      * optimized for 64byte cache lines */
     MpegEncContext * const s = &h->s;

From b8bb9c026789ca9cd6d7a3a6263fc6e8a3467767 Mon Sep 17 00:00:00 2001
From: Jean First <jeanfirst@gmail.com>
Date: Fri, 30 Sep 2011 09:42:45 +0200
Subject: [PATCH 10/35] Enable multithreading when decoding with libopenjpeg

Enable multithreading when decoding with libopenjpeg

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
---
 libavcodec/libopenjpeg.c | 45 ++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/libavcodec/libopenjpeg.c b/libavcodec/libopenjpeg.c
index 42809b992f..1facd21044 100644
--- a/libavcodec/libopenjpeg.c
+++ b/libavcodec/libopenjpeg.c
@@ -27,6 +27,7 @@
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "libavutil/intreadwrite.h"
+#include "thread.h"
 #define  OPJ_STATIC
 #include <openjpeg.h>
 
@@ -57,6 +58,14 @@ static av_cold int libopenjpeg_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static av_cold int libopenjpeg_decode_init_thread_copy(AVCodecContext *avctx)
+{
+    LibOpenJPEGContext *ctx = avctx->priv_data;
+
+    avctx->coded_frame = &ctx->image;
+    return 0;
+}
+
 static int libopenjpeg_decode_frame(AVCodecContext *avctx,
                                     void *data, int *data_size,
                                     AVPacket *avpkt)
@@ -94,7 +103,7 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     }
     opj_set_event_mgr((opj_common_ptr)dec, NULL, NULL);
 
-    ctx->dec_params.cp_reduce = avctx->lowres;
+    ctx->dec_params.cp_limit_decoding = LIMIT_TO_MAIN_HEADER;
     // Tie decoder with decoding parameters
     opj_setup_decoder(dec, &ctx->dec_params);
     stream = opj_cio_open((opj_common_ptr)dec, buf, buf_size);
@@ -104,7 +113,7 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
         return -1;
     }
 
-    // Decode the codestream
+    // Decode the header only
     image = opj_decode_with_info(dec, stream, NULL);
     opj_cio_close(stream);
     if(!image) {
@@ -112,8 +121,8 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
         opj_destroy_decompress(dec);
         return -1;
     }
-    width  = image->comps[0].w << avctx->lowres;
-    height = image->comps[0].h << avctx->lowres;
+    width  = image->x1 - image->x0;
+    height = image->y1 - image->y0;
     if(av_image_check_size(width, height, 0, avctx) < 0) {
         av_log(avctx, AV_LOG_ERROR, "%dx%d dimension invalid.\n", width, height);
         goto done;
@@ -139,13 +148,30 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     }
 
     if(picture->data[0])
-        avctx->release_buffer(avctx, picture);
+        ff_thread_release_buffer(avctx, picture);
+
+    if(ff_thread_get_buffer(avctx, picture) < 0){
+        av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed\n");
+        return -1;
+    }
 
-    if(avctx->get_buffer(avctx, picture) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Couldn't allocate image buffer.\n");
+    ff_thread_finish_setup(avctx);
+
+    ctx->dec_params.cp_limit_decoding = NO_LIMITATION;
+    ctx->dec_params.cp_reduce = avctx->lowres;
+    // Tie decoder with decoding parameters
+    opj_setup_decoder(dec, &ctx->dec_params);
+    stream = opj_cio_open((opj_common_ptr)dec, buf, buf_size);
+    if(!stream) {
+        av_log(avctx, AV_LOG_ERROR, "Codestream could not be opened for reading.\n");
+        opj_destroy_decompress(dec);
         return -1;
     }
 
+    // Decode the codestream
+    image = opj_decode_with_info(dec, stream, NULL);
+    opj_cio_close(stream);
+
     for(x = 0; x < image->numcomps; x++) {
         adjust[x] = FFMAX(image->comps[x].prec - 8, 0);
     }
@@ -179,7 +205,7 @@ static av_cold int libopenjpeg_decode_close(AVCodecContext *avctx)
     LibOpenJPEGContext *ctx = avctx->priv_data;
 
     if(ctx->image.data[0])
-        avctx->release_buffer(avctx, &ctx->image);
+        ff_thread_release_buffer(avctx, &ctx->image);
     return 0 ;
 }
 
@@ -192,7 +218,8 @@ AVCodec ff_libopenjpeg_decoder = {
     .init           = libopenjpeg_decode_init,
     .close          = libopenjpeg_decode_close,
     .decode         = libopenjpeg_decode_frame,
-    .capabilities   = CODEC_CAP_DR1,
+    .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
     .max_lowres     = 5,
     .long_name      = NULL_IF_CONFIG_SMALL("OpenJPEG based JPEG 2000 decoder"),
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(libopenjpeg_decode_init_thread_copy)
 };

From b034c95cc1a700b7d0849cedb1316989c3fb15be Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Fri, 21 Oct 2011 12:34:08 +0100
Subject: [PATCH 11/35] h264: fix ppc/altivec build

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/ppc/h264_altivec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index edc043c3c7..3163a37d3b 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -956,11 +956,11 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
 #define H264_WEIGHT(W) \
 static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
                                                    int log2_denom, int weight, int offset){ \
-    weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \
+    weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
 }\
 static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
                                                      int log2_denom, int weightd, int weights, int offset){ \
-    biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
+    biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
 }
 
 H264_WEIGHT(16)

From ef74e3979930d99830e01a52b0e09f6997938696 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Thu, 20 Oct 2011 15:08:48 -0400
Subject: [PATCH 12/35] flvenc: store delay and last_ts per-stream.

---
 libavformat/flvenc.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/libavformat/flvenc.c b/libavformat/flvenc.c
index cf77157636..0d1fa7312a 100644
--- a/libavformat/flvenc.c
+++ b/libavformat/flvenc.c
@@ -57,10 +57,13 @@ typedef struct FLVContext {
     int64_t duration_offset;
     int64_t filesize_offset;
     int64_t duration;
-    int delay; ///< first dts delay for AVC
-    int64_t last_ts;
 } FLVContext;
 
+typedef struct FLVStreamContext {
+    int     delay;      ///< first dts delay for each stream (needed for AVC & Speex)
+    int64_t last_ts;    ///< last timestamp for each stream
+} FLVStreamContext;
+
 static int get_audio_flags(AVCodecContext *enc){
     int flags = (enc->bits_per_coded_sample == 16) ? FLV_SAMPLESSIZE_16BIT : FLV_SAMPLESSIZE_8BIT;
 
@@ -179,6 +182,7 @@ static int flv_write_header(AVFormatContext *s)
 
     for(i=0; i<s->nb_streams; i++){
         AVCodecContext *enc = s->streams[i]->codec;
+        FLVStreamContext *sc;
         if (enc->codec_type == AVMEDIA_TYPE_VIDEO) {
             if (s->streams[i]->r_frame_rate.den && s->streams[i]->r_frame_rate.num) {
                 framerate = av_q2d(s->streams[i]->r_frame_rate);
@@ -196,6 +200,12 @@ static int flv_write_header(AVFormatContext *s)
                 return -1;
         }
         av_set_pts_info(s->streams[i], 32, 1, 1000); /* 32 bit pts in ms */
+
+        sc = av_mallocz(sizeof(FLVStreamContext));
+        if (!sc)
+            return AVERROR(ENOMEM);
+        s->streams[i]->priv_data = sc;
+        sc->last_ts = -1;
     }
     avio_write(pb, "FLV", 3);
     avio_w8(pb,1);
@@ -215,8 +225,6 @@ static int flv_write_header(AVFormatContext *s)
         }
     }
 
-    flv->last_ts = -1;
-
     /* write meta_tag */
     avio_w8(pb, 18);         // tag type META
     metadata_size_pos= avio_tell(pb);
@@ -342,9 +350,10 @@ static int flv_write_trailer(AVFormatContext *s)
     /* Add EOS tag */
     for (i = 0; i < s->nb_streams; i++) {
         AVCodecContext *enc = s->streams[i]->codec;
+        FLVStreamContext *sc = s->streams[i]->priv_data;
         if (enc->codec_type == AVMEDIA_TYPE_VIDEO &&
                 enc->codec_id == CODEC_ID_H264) {
-            put_avc_eos_tag(pb, flv->last_ts);
+            put_avc_eos_tag(pb, sc->last_ts);
         }
     }
 
@@ -365,6 +374,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
     AVIOContext *pb = s->pb;
     AVCodecContext *enc = s->streams[pkt->stream_index]->codec;
     FLVContext *flv = s->priv_data;
+    FLVStreamContext *sc = s->streams[pkt->stream_index]->priv_data;
     unsigned ts;
     int size= pkt->size;
     uint8_t *data= NULL;
@@ -406,20 +416,20 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
                 return -1;
         }
     }
-    if (!flv->delay && pkt->dts < 0)
-        flv->delay = -pkt->dts;
+    if (!sc->delay && pkt->dts < 0)
+        sc->delay = -pkt->dts;
 
-    ts = pkt->dts + flv->delay; // add delay to force positive dts
+    ts = pkt->dts + sc->delay; // add delay to force positive dts
 
     /* check Speex packet duration */
-    if (enc->codec_id == CODEC_ID_SPEEX && ts - flv->last_ts > 160) {
+    if (enc->codec_id == CODEC_ID_SPEEX && ts - sc->last_ts > 160) {
         av_log(s, AV_LOG_WARNING, "Warning: Speex stream has more than "
                                   "8 frames per packet. Adobe Flash "
                                   "Player cannot handle this!\n");
     }
 
-    if (flv->last_ts < ts)
-        flv->last_ts = ts;
+    if (sc->last_ts < ts)
+        sc->last_ts = ts;
 
     avio_wb24(pb,size + flags_size);
     avio_wb24(pb,ts);
@@ -440,7 +450,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
     avio_write(pb, data ? data : pkt->data, size);
 
     avio_wb32(pb,size+flags_size+11); // previous tag size
-    flv->duration = FFMAX(flv->duration, pkt->pts + flv->delay + pkt->duration);
+    flv->duration = FFMAX(flv->duration, pkt->pts + sc->delay + pkt->duration);
 
     avio_flush(pb);
 

From 45add995de6a1458cd8095abb302f9a7cbd3e3ee Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 9 Oct 2011 16:30:11 -0400
Subject: [PATCH 13/35] fmtconvert: fix and extend documentation for
 float_interleave()

---
 libavcodec/fmtconvert.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index d7741135b7..1b534019f1 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -70,7 +70,15 @@ typedef struct FmtConvertContext {
                                       long len, int channels);
 
     /**
-     * Convert an array of interleaved float to multiple arrays of float.
+     * Convert multiple arrays of float to an array of interleaved float.
+     *
+     * @param dst destination array of interleaved float.
+     *            constraints: 16-byte aligned
+     * @param src source array of float arrays, one for each channel.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 8
+     * @param channels number of channels
      */
     void (*float_interleave)(float *dst, const float **src, unsigned int len,
                              int channels);

From 708ab7dd69d5c98221882a8086f68f1bb02a44a3 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 9 Oct 2011 19:12:09 -0400
Subject: [PATCH 14/35] fmtconvert: port float_to_int16() x86 inline asm to
 yasm

---
 libavcodec/x86/fmtconvert.asm   | 42 ++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c | 89 +++++----------------------------
 2 files changed, 55 insertions(+), 76 deletions(-)

diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index efab87d570..d314a4e14e 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,48 @@
 
 SECTION_TEXT
 
+;------------------------------------------------------------------------------
+; void ff_float_to_int16(int16_t *dst, const float *src, long len);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16 2
+cglobal float_to_int16_%1, 3,3,%2, dst, src, len
+    add       lenq, lenq
+    lea       srcq, [srcq+2*lenq]
+    add       dstq, lenq
+    neg       lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq    m0, [srcq+2*lenq   ]
+    cvtps2dq    m1, [srcq+2*lenq+16]
+    packssdw    m0, m1
+    mova  [dstq+lenq], m0
+%else
+    cvtps2pi    m0, [srcq+2*lenq   ]
+    cvtps2pi    m1, [srcq+2*lenq+ 8]
+    cvtps2pi    m2, [srcq+2*lenq+16]
+    cvtps2pi    m3, [srcq+2*lenq+24]
+    packssdw    m0, m1
+    packssdw    m2, m3
+    mova  [dstq+lenq  ], m0
+    mova  [dstq+lenq+8], m2
+%endif
+    add       lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16 sse2, 2
+INIT_MMX
+FLOAT_TO_INT16 sse, 0
+%define cvtps2pi pf2id
+FLOAT_TO_INT16 3dnow, 0
+%undef cvtps2pi
+
+
 %macro PSWAPD_SSE 2
     pshufw %1, %2, 0x4e
 %endmacro
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 253f60bfc2..949dc973f3 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -70,80 +70,16 @@ static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mu
     );
 }
 
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    // not bit-exact: pf2id uses different rounding than C and SSE
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
-        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
-        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
-        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
-        "packssdw   %%mm1       , %%mm0     \n\t"
-        "packssdw   %%mm3       , %%mm2     \n\t"
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        "femms                              \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
+#if HAVE_YASM
 
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
-        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
-        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
-        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
-        "packssdw   %%mm1       , %%mm0     \n\t"
-        "packssdw   %%mm3       , %%mm2     \n\t"
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        "emms                               \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
-
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
-        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
-        "packssdw   %%xmm1      , %%xmm0    \n\t"
-        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
+void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 
-#if !HAVE_YASM
-#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
@@ -152,7 +88,7 @@ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const
     DECLARE_ALIGNED(16, int16_t, tmp)[len];\
     int i,j,c;\
     for(c=0; c<channels; c++){\
-        float_to_int16_##cpu(tmp, src[c], len);\
+        ff_float_to_int16_##cpu(tmp, src[c], len);\
         for(i=0, j=c; i<len; i++, j+=channels)\
             dst[j] = tmp[i];\
     }\
@@ -160,7 +96,7 @@ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const
 \
 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
     if(channels==1)\
-        float_to_int16_##cpu(dst, src[0], len);\
+        ff_float_to_int16_##cpu(dst, src[0], len);\
     else if(channels==2){\
         x86_reg reglen = len; \
         const float *src0 = src[0];\
@@ -235,7 +171,6 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
         float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
 
-#if HAVE_YASM
 void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
 void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
 
@@ -272,11 +207,10 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
     if (mm_flags & AV_CPU_FLAG_MMX) {
 #if HAVE_YASM
         c->float_interleave = float_interleave_mmx;
-#endif
 
         if(mm_flags & AV_CPU_FLAG_3DNOW){
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-                c->float_to_int16 = float_to_int16_3dnow;
+                c->float_to_int16 = ff_float_to_int16_3dnow;
                 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
             }
         }
@@ -285,18 +219,21 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
+#endif
         if(mm_flags & AV_CPU_FLAG_SSE){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
-            c->float_to_int16 = float_to_int16_sse;
-            c->float_to_int16_interleave = float_to_int16_interleave_sse;
 #if HAVE_YASM
+            c->float_to_int16 = ff_float_to_int16_sse;
+            c->float_to_int16_interleave = float_to_int16_interleave_sse;
             c->float_interleave = float_interleave_sse;
 #endif
         }
         if(mm_flags & AV_CPU_FLAG_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-            c->float_to_int16 = float_to_int16_sse2;
+#if HAVE_YASM
+            c->float_to_int16 = ff_float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
+#endif
         }
     }
 }

From 185142a5ea93ef723f70a3ea43797f6c8827eb79 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 9 Oct 2011 20:01:22 -0400
Subject: [PATCH 15/35] fmtconvert: check compile-time x86 instruction set
 flags

---
 libavcodec/x86/fmtconvert_mmx.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 949dc973f3..6e43280d66 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -208,19 +208,19 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 #if HAVE_YASM
         c->float_interleave = float_interleave_mmx;
 
-        if(mm_flags & AV_CPU_FLAG_3DNOW){
+        if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->float_to_int16 = ff_float_to_int16_3dnow;
                 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
             }
         }
-        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+        if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) {
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
 #endif
-        if(mm_flags & AV_CPU_FLAG_SSE){
+        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
 #if HAVE_YASM
             c->float_to_int16 = ff_float_to_int16_sse;
@@ -228,7 +228,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
             c->float_interleave = float_interleave_sse;
 #endif
         }
-        if(mm_flags & AV_CPU_FLAG_SSE2){
+        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
 #if HAVE_YASM
             c->float_to_int16 = ff_float_to_int16_sse2;

From 4e8e2624767f4af0eaa932c543d072fed96fd586 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 9 Oct 2011 23:52:03 -0400
Subject: [PATCH 16/35] fmtconvert: port int32_to_float_fmul_scalar() x86
 inline asm to yasm

---
 libavcodec/x86/dsputil_yasm.asm |  8 -----
 libavcodec/x86/fmtconvert.asm   | 46 +++++++++++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c | 59 ++++-----------------------------
 libavutil/x86/x86util.asm       | 12 +++++++
 4 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 2a2108404a..fe96d8b12b 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1055,14 +1055,6 @@ emu_edge mmx
 ;                           int32_t max, unsigned int len)
 ;-----------------------------------------------------------------------------
 
-%macro SPLATD_MMX 1
-    punpckldq  %1, %1
-%endmacro
-
-%macro SPLATD_SSE2 1
-    pshufd  %1, %1, 0
-%endmacro
-
 %macro VECTOR_CLIP_INT32 4
 cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
 %ifidn %1, sse2
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index d314a4e14e..e3eb5d2286 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,60 @@
 
 SECTION_TEXT
 
+;---------------------------------------------------------------------------------
+; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
+;---------------------------------------------------------------------------------
+; %1 = cpu suffix of the generated function (sse or sse2)
+; %2 = number of xmm registers declared to cglobal
+; Converts len int32 values from src to float, multiplies each one by mul and
+; stores the results to dst; 8 values (32 bytes) are processed per iteration.
+%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%ifdef UNIX64
+cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+%else
+cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+%endif
+%ifdef WIN64
+    SWAP 0, 2               ; Win64: mul is the 3rd argument and arrives in xmm2
+%elifndef ARCH_X86_64
+    movss   m0, mulm        ; x86-32: load mul from its argument slot
+%endif
+    SPLATD  m0
+    shl     lend, 2         ; len is a 32-bit int; the dword write clears bits 63:32
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+%ifidn %1, sse2
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     lenq, 32
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM
+%define SPLATD SPLATD_SSE
+%define movdqa movaps
+INT32_TO_FLOAT_FMUL_SCALAR sse, 5
+%undef movdqa
+%define SPLATD SPLATD_SSE2
+INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
+%undef SPLATD
+
+
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
 ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 6e43280d66..86957b45ff 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -26,52 +26,11 @@
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/fmtconvert.h"
 
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtpi2ps   (%2,%0), %%xmm0 \n"
-        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
-        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
-        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
-        "movlhps  %%xmm1,    %%xmm0 \n"
-        "movlhps  %%xmm3,    %%xmm2 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm2 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm2, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtdq2ps   (%2,%0), %%xmm0 \n"
-        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm1 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm1, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
 #if HAVE_YASM
 
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
+
 void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
@@ -204,8 +163,8 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
 
-    if (mm_flags & AV_CPU_FLAG_MMX) {
 #if HAVE_YASM
+    if (mm_flags & AV_CPU_FLAG_MMX) {
         c->float_interleave = float_interleave_mmx;
 
         if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
@@ -219,21 +178,17 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
-#endif
         if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
-#if HAVE_YASM
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
             c->float_to_int16 = ff_float_to_int16_sse;
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
             c->float_interleave = float_interleave_sse;
-#endif
         }
         if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-#if HAVE_YASM
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = ff_float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-#endif
         }
     }
+#endif
 }
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 7e16c15db2..874443a2ef 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -536,6 +536,18 @@
 %endif
 %endmacro
 
+%macro SPLATD_MMX 1
+    punpckldq  %1, %1
+%endmacro
+
+%macro SPLATD_SSE 1
+    shufps  %1, %1, 0
+%endmacro
+
+%macro SPLATD_SSE2 1
+    pshufd  %1, %1, 0
+%endmacro
+
 %macro CLIPW 3 ;(dst, min, max)
     pmaxsw %1, %2
     pminsw %1, %3

From aad3429d4e34b74e4eb0b37b17f32804e217cf02 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Mon, 10 Oct 2011 00:43:08 -0400
Subject: [PATCH 17/35] fmtconvert: port float_to_int16_interleave() 2-channel
 x86 inline asm to yasm

---
 libavcodec/x86/fmtconvert.asm   | 52 +++++++++++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c | 69 +++++----------------------------
 2 files changed, 61 insertions(+), 60 deletions(-)

diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index e3eb5d2286..854954835c 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -112,6 +112,58 @@ FLOAT_TO_INT16 3dnow, 0
 %undef cvtps2pi
 
 
+;-------------------------------------------------------------------------------
+; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
+;-------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_INTERLEAVE2 1
+cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
+    lea      lenq, [4*r2q]
+    mov     src1q, [src0q+gprsize]
+    mov     src0q, [src0q]
+    add      dstq, lenq
+    add     src0q, lenq
+    add     src1q, lenq
+    neg      lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq   m0, [src0q+lenq]
+    cvtps2dq   m1, [src1q+lenq]
+    packssdw   m0, m1
+    movhlps    m1, m0
+    punpcklwd  m0, m1
+    mova  [dstq+lenq], m0
+%else
+    cvtps2pi   m0, [src0q+lenq  ]
+    cvtps2pi   m1, [src0q+lenq+8]
+    cvtps2pi   m2, [src1q+lenq  ]
+    cvtps2pi   m3, [src1q+lenq+8]
+    packssdw   m0, m1
+    packssdw   m2, m3
+    mova       m1, m0
+    punpcklwd  m0, m2
+    punpckhwd  m1, m2
+    mova  [dstq+lenq  ], m0
+    mova  [dstq+lenq+8], m1
+%endif
+    add      lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_INTERLEAVE2 3dnow
+%undef cvtps2pi
+%define movdqa movaps
+FLOAT_TO_INT16_INTERLEAVE2 sse
+%undef movdqa
+INIT_XMM
+FLOAT_TO_INT16_INTERLEAVE2 sse2
+
+
 %macro PSWAPD_SSE 2
     pshufw %1, %2, 0x4e
 %endmacro
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 86957b45ff..17079d3c82 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -35,13 +35,17 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
+void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse  (int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
+
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
     DECLARE_ALIGNED(16, int16_t, tmp)[len];\
@@ -57,71 +61,16 @@ static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, lon
     if(channels==1)\
         ff_float_to_int16_##cpu(dst, src[0], len);\
     else if(channels==2){\
-        x86_reg reglen = len; \
-        const float *src0 = src[0];\
-        const float *src1 = src[1];\
-        __asm__ volatile(\
-            "shl $2, %0 \n"\
-            "add %0, %1 \n"\
-            "add %0, %2 \n"\
-            "add %0, %3 \n"\
-            "neg %0 \n"\
-            body\
-            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
-        );\
+        ff_float_to_int16_interleave2_##cpu(dst, src, len);\
     }else if(channels==6){\
         ff_float_to_int16_interleave6_##cpu(dst, src, len);\
     }else\
         float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 }
 
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
-    "1:                         \n"
-    "pf2id     (%2,%0), %%mm0   \n"
-    "pf2id    8(%2,%0), %%mm1   \n"
-    "pf2id     (%3,%0), %%mm2   \n"
-    "pf2id    8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "femms                      \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
-    "1:                         \n"
-    "cvtps2pi  (%2,%0), %%mm0   \n"
-    "cvtps2pi 8(%2,%0), %%mm1   \n"
-    "cvtps2pi  (%3,%0), %%mm2   \n"
-    "cvtps2pi 8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "emms                       \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
-    "1:                         \n"
-    "cvtps2dq  (%2,%0), %%xmm0  \n"
-    "cvtps2dq  (%3,%0), %%xmm1  \n"
-    "packssdw   %%xmm1, %%xmm0  \n"
-    "movhlps    %%xmm0, %%xmm1  \n"
-    "punpcklwd  %%xmm1, %%xmm0  \n"
-    "movdqa     %%xmm0, (%1,%0) \n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-)
+FLOAT_TO_INT16_INTERLEAVE(3dnow)
+FLOAT_TO_INT16_INTERLEAVE(sse)
+FLOAT_TO_INT16_INTERLEAVE(sse2)
 
 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
     if(channels==6)

From cb72230dfadb28651e036d717dc12d33b18a6893 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 25 Sep 2011 12:16:34 -0400
Subject: [PATCH 18/35] mp3on4: copy MPADSPContext from first context to all
 contexts.

Fixes segfault when decoding multi-channel MP3onMP4 files.
---
 libavcodec/mpegaudiodec.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index 2af05edc87..5d15d25e48 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1959,6 +1959,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
         s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext));
         s->mp3decctx[i]->adu_mode = 1;
         s->mp3decctx[i]->avctx = avctx;
+        s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
     }
 
     return 0;

From f507dd067aec52b251f25e265cdb8b333db33b42 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 25 Sep 2011 12:30:16 -0400
Subject: [PATCH 19/35] mp3on4: allocate temp buffer with av_malloc() instead
 of on the stack.

Avoids allocating unnecessary memory and ensures proper alignment.
---
 libavcodec/mpegaudiodec.c | 45 +++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index 5d15d25e48..3bd7b02b9c 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1894,6 +1894,7 @@ typedef struct MP3On4DecodeContext {
     int syncword; ///< syncword patch
     const uint8_t *coff; ///< channels offsets in output buffer
     MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance
+    OUT_INT *decoded_buf;           ///< output buffer for decoded samples
 } MP3On4DecodeContext;
 
 #include "mpeg4audio.h"
@@ -1913,6 +1914,20 @@ static const uint8_t chan_offset[8][5] = {
 };
 
 
+static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
+{
+    MP3On4DecodeContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < s->frames; i++)
+        av_free(s->mp3decctx[i]);
+
+    av_freep(&s->decoded_buf);
+
+    return 0;
+}
+
+
 static int decode_init_mp3on4(AVCodecContext * avctx)
 {
     MP3On4DecodeContext *s = avctx->priv_data;
@@ -1962,19 +1977,18 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
         s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
     }
 
-    return 0;
-}
-
-
-static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
-{
-    MP3On4DecodeContext *s = avctx->priv_data;
-    int i;
-
-    for (i = 0; i < s->frames; i++)
-        av_free(s->mp3decctx[i]);
+    /* Allocate buffer for multi-channel output if needed */
+    if (s->frames > 1) {
+        s->decoded_buf = av_malloc(MPA_FRAME_SIZE * MPA_MAX_CHANNELS *
+                                   sizeof(*s->decoded_buf));
+        if (!s->decoded_buf)
+            goto alloc_fail;
+    }
 
     return 0;
+alloc_fail:
+    decode_close_mp3on4(avctx);
+    return AVERROR(ENOMEM);
 }
 
 
@@ -1989,7 +2003,6 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
     int fsize, len = buf_size, out_size = 0;
     uint32_t header;
     OUT_INT *out_samples = data;
-    OUT_INT decoded_buf[MPA_FRAME_SIZE * MPA_MAX_CHANNELS];
     OUT_INT *outptr, *bp;
     int fr, j, n;
 
@@ -2002,7 +2015,7 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
         return -1;
 
     // If only one decoder interleave is not needed
-    outptr = s->frames == 1 ? out_samples : decoded_buf;
+    outptr = s->frames == 1 ? out_samples : s->decoded_buf;
 
     avctx->bit_rate = 0;
 
@@ -2028,13 +2041,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
             bp = out_samples + s->coff[fr];
             if(m->nb_channels == 1) {
                 for(j = 0; j < n; j++) {
-                    *bp = decoded_buf[j];
+                    *bp = s->decoded_buf[j];
                     bp += avctx->channels;
                 }
             } else {
                 for(j = 0; j < n; j++) {
-                    bp[0] = decoded_buf[j++];
-                    bp[1] = decoded_buf[j];
+                    bp[0] = s->decoded_buf[j++];
+                    bp[1] = s->decoded_buf[j];
                     bp += avctx->channels;
                 }
             }

From fff0f831e0c8ccf87a6374f4bb349ac668bce14e Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 25 Sep 2011 12:46:54 -0400
Subject: [PATCH 20/35] mp3on4: fix the output channel order

---
 libavcodec/mpegaudiodec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index 3bd7b02b9c..57dadedca1 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1901,16 +1901,16 @@ typedef struct MP3On4DecodeContext {
 
 /* Next 3 arrays are indexed by channel config number (passed via codecdata) */
 static const uint8_t mp3Frames[8] = {0,1,1,2,3,3,4,5};   /* number of mp3 decoder instances */
-/* offsets into output buffer, assume output order is FL FR BL BR C LFE */
+/* offsets into output buffer, assume output order is FL FR C LFE BL BR SL SR */
 static const uint8_t chan_offset[8][5] = {
     {0},
     {0},            // C
     {0},            // FLR
     {2,0},          // C FLR
     {2,0,3},        // C FLR BS
-    {4,0,2},        // C FLR BLRS
-    {4,0,2,5},      // C FLR BLRS LFE
-    {4,0,2,6,5},    // C FLR BLRS BLR LFE
+    {2,0,3},        // C FLR BLRS
+    {2,0,4,3},      // C FLR BLRS LFE
+    {2,0,6,4,3},    // C FLR BLRS BLR LFE
 };
 
 

From 1183d6cd98da7d1e9f751a68d288b200240ed335 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 25 Sep 2011 12:52:11 -0400
Subject: [PATCH 21/35] mp3on4: set channel layout

---
 libavcodec/mpegaudiodec.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index 57dadedca1..f2728585f7 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1913,6 +1913,17 @@ static const uint8_t chan_offset[8][5] = {
     {2,0,6,4,3},    // C FLR BLRS BLR LFE
 };
 
+/* mp3on4 channel layouts */
+static const int16_t chan_layout[8] = {
+    0,
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_LAYOUT_4POINT0,
+    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_7POINT1
+};
 
 static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
 {
@@ -1947,6 +1958,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
     s->frames = mp3Frames[cfg.chan_config];
     s->coff = chan_offset[cfg.chan_config];
     avctx->channels = ff_mpeg4audio_channels[cfg.chan_config];
+    avctx->channel_layout = chan_layout[cfg.chan_config];
 
     if (cfg.sample_rate < 16000)
         s->syncword = 0xffe00000;

From 53c8443ad2376a50c76e5d7c69435bd01b0abc42 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 25 Sep 2011 13:04:39 -0400
Subject: [PATCH 22/35] mp3on4: ensure that the frame channel count does not
 exceed the codec channel count.

This also allows for checking output data size based on the actual
number of channels instead of the maximum number of channels.
---
 libavcodec/mpegaudiodec.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index f2728585f7..c3c6ee3805 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -2016,10 +2016,12 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
     uint32_t header;
     OUT_INT *out_samples = data;
     OUT_INT *outptr, *bp;
-    int fr, j, n;
+    int fr, j, n, ch;
 
-    if(*data_size < MPA_FRAME_SIZE * MPA_MAX_CHANNELS * s->frames * sizeof(OUT_INT))
-        return -1;
+    if (*data_size < MPA_FRAME_SIZE * avctx->channels * sizeof(OUT_INT)) {
+        av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n");
+        return AVERROR(EINVAL);
+    }
 
     *data_size = 0;
     // Discard too short frames
@@ -2031,6 +2033,7 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
 
     avctx->bit_rate = 0;
 
+    ch = 0;
     for (fr = 0; fr < s->frames; fr++) {
         fsize = AV_RB16(buf) >> 4;
         fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE);
@@ -2043,6 +2046,14 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
             break;
 
         avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header);
+
+        if (ch + m->nb_channels > avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "frame channel count exceeds codec "
+                                        "channel count\n");
+            return AVERROR_INVALIDDATA;
+        }
+        ch += m->nb_channels;
+
         out_size += mp_decode_frame(m, outptr, buf, fsize);
         buf += fsize;
         len -= fsize;

From 180bf988bc524f4775dd4765f07816df324e808b Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 25 Sep 2011 13:39:04 -0400
Subject: [PATCH 23/35] mp3on4: create a separate flush function for MP3onMP4.

The correct decoder private context needs to be used.
This fixes mp3on4 playback and seeking in avplay.
---
 libavcodec/mpegaudiodec.c       | 15 ++++++++++++++-
 libavcodec/mpegaudiodec_float.c |  2 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index c3c6ee3805..040b1090d6 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -2004,6 +2004,19 @@ alloc_fail:
 }
 
 
+static void flush_mp3on4(AVCodecContext *avctx)
+{
+    int i;
+    MP3On4DecodeContext *s = avctx->priv_data;
+
+    for (i = 0; i < s->frames; i++) {
+        MPADecodeContext *m = s->mp3decctx[i];
+        memset(m->synth_buf, 0, sizeof(m->synth_buf));
+        m->last_buf_size = 0;
+    }
+}
+
+
 static int decode_frame_mp3on4(AVCodecContext * avctx,
                         void *data, int *data_size,
                         AVPacket *avpkt)
@@ -2148,7 +2161,7 @@ AVCodec ff_mp3on4_decoder = {
     .init           = decode_init_mp3on4,
     .close          = decode_close_mp3on4,
     .decode         = decode_frame_mp3on4,
-    .flush          = flush,
+    .flush          = flush_mp3on4,
     .long_name      = NULL_IF_CONFIG_SMALL("MP3onMP4"),
 };
 #endif
diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c
index 929d72738b..7f512500b3 100644
--- a/libavcodec/mpegaudiodec_float.c
+++ b/libavcodec/mpegaudiodec_float.c
@@ -83,7 +83,7 @@ AVCodec ff_mp3on4float_decoder = {
     .init           = decode_init_mp3on4,
     .close          = decode_close_mp3on4,
     .decode         = decode_frame_mp3on4,
-    .flush          = flush,
+    .flush          = flush_mp3on4,
     .long_name      = NULL_IF_CONFIG_SMALL("MP3onMP4"),
 };
 #endif

From 95891804bf300b266aa5328f1c338c046720e658 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 25 Sep 2011 14:32:42 -0400
Subject: [PATCH 24/35] mp3on4: check for allocation failures in
 decode_init_mp3on4()

---
 libavcodec/mpegaudiodec.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index 040b1090d6..2b357b5412 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1972,6 +1972,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
      */
     // Allocate zeroed memory for the first decoder context
     s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext));
+    if (!s->mp3decctx[0])
+        goto alloc_fail;
     // Put decoder context in place to make init_decode() happy
     avctx->priv_data = s->mp3decctx[0];
     decode_init(avctx);
@@ -1984,6 +1986,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
      */
     for (i = 1; i < s->frames; i++) {
         s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext));
+        if (!s->mp3decctx[i])
+            goto alloc_fail;
         s->mp3decctx[i]->adu_mode = 1;
         s->mp3decctx[i]->avctx = avctx;
         s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;

From 94395fbf8c85d6139115d8b20cce19476ed87806 Mon Sep 17 00:00:00 2001
From: Raivo Hool <raivo.hool@gmail.com>
Date: Fri, 21 Oct 2011 16:51:33 +0300
Subject: [PATCH 25/35] mov: parse the gnre atom

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/mov.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/libavformat/mov.c b/libavformat/mov.c
index 6baddebb82..3c551c63aa 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -35,6 +35,7 @@
 #include "riff.h"
 #include "isom.h"
 #include "libavcodec/get_bits.h"
+#include "id3v1.h"
 
 #if CONFIG_ZLIB
 #include <zlib.h>
@@ -126,6 +127,23 @@ static int mov_metadata_stik(MOVContext *c, AVIOContext *pb,
   return 0;
 }
 
+static int mov_metadata_gnre(MOVContext *c, AVIOContext *pb,
+                             unsigned len, const char *key)
+{
+    short genre;
+    char buf[20];
+
+    avio_r8(pb); // unknown
+
+    genre = avio_r8(pb);
+    if (genre < 1 || genre > ID3v1_GENRE_MAX)
+        return 0;
+    snprintf(buf, sizeof(buf), "%s", ff_id3v1_genre_str[genre-1]);
+    av_dict_set(&c->fc->metadata, key, buf, 0);
+
+    return 0;
+}
+
 static const uint32_t mac_to_unicode[128] = {
     0x00C4,0x00C5,0x00C7,0x00C9,0x00D1,0x00D6,0x00DC,0x00E1,
     0x00E0,0x00E2,0x00E4,0x00E3,0x00E5,0x00E7,0x00E9,0x00E8,
@@ -187,6 +205,8 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     case MKTAG(0xa9,'a','l','b'): key = "album";     break;
     case MKTAG(0xa9,'d','a','y'): key = "date";      break;
     case MKTAG(0xa9,'g','e','n'): key = "genre";     break;
+    case MKTAG( 'g','n','r','e'): key = "genre";
+        parse = mov_metadata_gnre; break;
     case MKTAG(0xa9,'t','o','o'):
     case MKTAG(0xa9,'s','w','r'): key = "encoder";   break;
     case MKTAG(0xa9,'e','n','c'): key = "encoder";   break;

From 80951f5cf6cd34b317857c710d8cce09b0b73c4f Mon Sep 17 00:00:00 2001
From: Raivo Hool <raivo.hool@gmail.com>
Date: Fri, 21 Oct 2011 16:04:13 +0300
Subject: [PATCH 26/35] mov: rename function _int8 to remove ambiguity, some
 indentation cosmetics

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/mov.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavformat/mov.c b/libavformat/mov.c
index 3c551c63aa..84bd4ed9c8 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -100,20 +100,20 @@ static int mov_metadata_track_or_disc_number(MOVContext *c, AVIOContext *pb,
     return 0;
 }
 
-static int mov_metadata_int8(MOVContext *c, AVIOContext *pb,
-                             unsigned len, const char *key)
+static int mov_metadata_int8_bypass_padding(MOVContext *c, AVIOContext *pb,
+                                            unsigned len, const char *key)
 {
-  char buf[16];
+    char buf[16];
 
-  /* bypass padding bytes */
-  avio_r8(pb);
-  avio_r8(pb);
-  avio_r8(pb);
+    /* bypass padding bytes */
+    avio_r8(pb);
+    avio_r8(pb);
+    avio_r8(pb);
 
-  snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
-  av_dict_set(&c->fc->metadata, key, buf, 0);
+    snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
+    av_dict_set(&c->fc->metadata, key, buf, 0);
 
-  return 0;
+    return 0;
 }
 
 static int mov_metadata_stik(MOVContext *c, AVIOContext *pb,
@@ -220,9 +220,9 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     case MKTAG( 'd','i','s','k'): key = "disc";
         parse = mov_metadata_track_or_disc_number; break;
     case MKTAG( 't','v','e','s'): key = "episode_sort";
-        parse = mov_metadata_int8; break;
+        parse = mov_metadata_int8_bypass_padding; break;
     case MKTAG( 't','v','s','n'): key = "season_number";
-        parse = mov_metadata_int8; break;
+        parse = mov_metadata_int8_bypass_padding; break;
     case MKTAG( 's','t','i','k'): key = "media_type";
         parse = mov_metadata_stik; break;
     }

From 5da35d1cb37fbaf0c6233955ec1934216d75a3bc Mon Sep 17 00:00:00 2001
From: Raivo Hool <raivo.hool@gmail.com>
Date: Fri, 21 Oct 2011 16:04:14 +0300
Subject: [PATCH 27/35] mov: rename function _stik, some indentation cosmetics

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/mov.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavformat/mov.c b/libavformat/mov.c
index 84bd4ed9c8..3a00b4679f 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -116,15 +116,15 @@ static int mov_metadata_int8_bypass_padding(MOVContext *c, AVIOContext *pb,
     return 0;
 }
 
-static int mov_metadata_stik(MOVContext *c, AVIOContext *pb,
-                             unsigned len, const char *key)
+static int mov_metadata_int8_no_padding(MOVContext *c, AVIOContext *pb,
+                                        unsigned len, const char *key)
 {
-  char buf[16];
+    char buf[16];
 
-  snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
-  av_dict_set(&c->fc->metadata, key, buf, 0);
+    snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
+    av_dict_set(&c->fc->metadata, key, buf, 0);
 
-  return 0;
+    return 0;
 }
 
 static int mov_metadata_gnre(MOVContext *c, AVIOContext *pb,
@@ -224,7 +224,7 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     case MKTAG( 't','v','s','n'): key = "season_number";
         parse = mov_metadata_int8_bypass_padding; break;
     case MKTAG( 's','t','i','k'): key = "media_type";
-        parse = mov_metadata_stik; break;
+        parse = mov_metadata_int8_no_padding; break;
     }
 
     if (c->itunes_metadata && atom.size > 8) {

From b06df7075590b7954900a9ef5e2dd0e7e832544c Mon Sep 17 00:00:00 2001
From: Raivo Hool <raivo.hool@gmail.com>
Date: Fri, 21 Oct 2011 16:04:15 +0300
Subject: [PATCH 28/35] mov: add support for hdvd and pgap metadata atoms

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/mov.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libavformat/mov.c b/libavformat/mov.c
index 3a00b4679f..1747bd41a2 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -225,6 +225,10 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
         parse = mov_metadata_int8_bypass_padding; break;
     case MKTAG( 's','t','i','k'): key = "media_type";
         parse = mov_metadata_int8_no_padding; break;
+    case MKTAG( 'h','d','v','d'): key = "hd_video";
+        parse = mov_metadata_int8_no_padding; break;
+    case MKTAG( 'p','g','a','p'): key = "gapless_playback";
+        parse = mov_metadata_int8_no_padding; break;
     }
 
     if (c->itunes_metadata && atom.size > 8) {

From 5dd35b43f1cd3dddaddaae8e2f267117b5fa2d54 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Fri, 21 Oct 2011 11:47:39 +0200
Subject: [PATCH 29/35] Move timefilter code from lavf to lavd.

It's only used in the JACK device.

Fixes linking shared lavd with JACK enabled.
---
 libavdevice/Makefile                      | 4 +++-
 libavdevice/jack_audio.c                  | 2 +-
 {libavformat => libavdevice}/timefilter.c | 2 +-
 {libavformat => libavdevice}/timefilter.h | 6 +++---
 libavformat/Makefile                      | 5 +----
 5 files changed, 9 insertions(+), 10 deletions(-)
 rename {libavformat => libavdevice}/timefilter.c (99%)
 rename {libavformat => libavdevice}/timefilter.h (97%)

diff --git a/libavdevice/Makefile b/libavdevice/Makefile
index d8a5945549..1f2a6efceb 100644
--- a/libavdevice/Makefile
+++ b/libavdevice/Makefile
@@ -13,7 +13,7 @@ OBJS-$(CONFIG_ALSA_OUTDEV)               += alsa-audio-common.o \
 OBJS-$(CONFIG_BKTR_INDEV)                += bktr.o
 OBJS-$(CONFIG_DV1394_INDEV)              += dv1394.o
 OBJS-$(CONFIG_FBDEV_INDEV)               += fbdev.o
-OBJS-$(CONFIG_JACK_INDEV)                += jack_audio.o
+OBJS-$(CONFIG_JACK_INDEV)                += jack_audio.o timefilter.o
 OBJS-$(CONFIG_OSS_INDEV)                 += oss_audio.o
 OBJS-$(CONFIG_OSS_OUTDEV)                += oss_audio.o
 OBJS-$(CONFIG_SNDIO_INDEV)               += sndio_common.o sndio_dec.o
@@ -30,4 +30,6 @@ OBJS-$(CONFIG_LIBDC1394_INDEV)           += libdc1394.o
 SKIPHEADERS-$(HAVE_ALSA_ASOUNDLIB_H)     += alsa-audio.h
 SKIPHEADERS-$(HAVE_SNDIO_H)              += sndio_common.h
 
+TESTPROGS = timefilter
+
 include $(SRC_PATH)/subdir.mak
diff --git a/libavdevice/jack_audio.c b/libavdevice/jack_audio.c
index 4907e82395..f75c176be9 100644
--- a/libavdevice/jack_audio.c
+++ b/libavdevice/jack_audio.c
@@ -29,7 +29,7 @@
 #include "libavutil/opt.h"
 #include "libavcodec/avcodec.h"
 #include "libavformat/avformat.h"
-#include "libavformat/timefilter.h"
+#include "timefilter.h"
 
 /**
  * Size of the internal FIFO buffers as a number of audio packets
diff --git a/libavformat/timefilter.c b/libavdevice/timefilter.c
similarity index 99%
rename from libavformat/timefilter.c
rename to libavdevice/timefilter.c
index 4860a4ff70..332d33b5e8 100644
--- a/libavformat/timefilter.c
+++ b/libavdevice/timefilter.c
@@ -24,8 +24,8 @@
 
 
 #include "config.h"
-#include "avformat.h"
 #include "timefilter.h"
+#include "libavutil/mem.h"
 
 struct TimeFilter {
     /// Delay Locked Loop data. These variables refer to mathematical
diff --git a/libavformat/timefilter.h b/libavdevice/timefilter.h
similarity index 97%
rename from libavformat/timefilter.h
rename to libavdevice/timefilter.h
index aa7db533b4..c98fd03bba 100644
--- a/libavformat/timefilter.h
+++ b/libavdevice/timefilter.h
@@ -22,8 +22,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVFORMAT_TIMEFILTER_H
-#define AVFORMAT_TIMEFILTER_H
+#ifndef AVDEVICE_TIMEFILTER_H
+#define AVDEVICE_TIMEFILTER_H
 
 /**
  * Opaque type representing a time filter state
@@ -94,4 +94,4 @@ void ff_timefilter_reset(TimeFilter *);
  */
 void ff_timefilter_destroy(TimeFilter *);
 
-#endif /* AVFORMAT_TIMEFILTER_H */
+#endif /* AVDEVICE_TIMEFILTER_H */
diff --git a/libavformat/Makefile b/libavformat/Makefile
index 0a30c6ec1a..6973b15995 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -334,11 +334,8 @@ OBJS-$(CONFIG_RTP_PROTOCOL)              += rtpproto.o
 OBJS-$(CONFIG_TCP_PROTOCOL)              += tcp.o
 OBJS-$(CONFIG_UDP_PROTOCOL)              += udp.o
 
-# libavdevice dependencies
-OBJS-$(CONFIG_JACK_INDEV)                += timefilter.o
-
 EXAMPLES  = metadata output
-TESTPROGS = seek timefilter
+TESTPROGS = seek
 TOOLS     = pktdumper probetest
 
 include $(SRC_PATH)/subdir.mak

From 41ac093f7e315e3af17612f580c387b3688f4f43 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Fri, 21 Oct 2011 20:36:11 +0100
Subject: [PATCH 30/35] swscale: fix signed shift overflows in
 ff_yuv2rgb_c_init_tables()

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libswscale/yuv2rgb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index cad09338d3..39c8b9c6fb 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -788,8 +788,8 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int
         y_table32 = c->yuvTable;
         yb = -(384<<16) - oy;
         for (i = 0; i < 1024; i++) {
-            uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
-            y_table32[i     ] = (yval << rbase) + (needAlpha ? 0 : (255 << abase));
+            unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
+            y_table32[i     ] = (yval << rbase) + (needAlpha ? 0 : (255u << abase));
             y_table32[i+1024] = yval << gbase;
             y_table32[i+2048] = yval << bbase;
             yb += cy;

From 7eeaa6796b647f8ef46c234e4f80a7dd5591ee71 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 21 Oct 2011 12:07:42 -0400
Subject: [PATCH 31/35] libspeexdec: decode one frame at a time.

This allows for knowing the output size before decoding even when there is no
header (e.g. FLV). Otherwise we would have to do a preliminary full frame
decode to determine the number of frames-per-packet.
---
 libavcodec/libspeexdec.c | 54 ++++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index cda987ca6a..fc90308065 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -99,32 +99,42 @@ static int libspeex_decode_frame(AVCodecContext *avctx,
     uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     LibSpeexContext *s = avctx->priv_data;
-    int16_t *output = data, *end;
-    int i, num_samples;
-
-    num_samples = s->frame_size * avctx->channels;
-    end = output + *data_size / sizeof(*output);
+    int16_t *output = data;
+    int out_size, ret, consumed = 0;
+
+    /* check output buffer size */
+    out_size = s->frame_size * avctx->channels *
+               av_get_bytes_per_sample(avctx->sample_fmt);
+    if (*data_size < out_size) {
+        av_log(avctx, AV_LOG_ERROR, "Output buffer is too small\n");
+        return AVERROR(EINVAL);
+    }
 
-    speex_bits_read_from(&s->bits, buf, buf_size);
+    /* if there is not enough data left for the smallest possible frame,
+       reset the libspeex buffer using the current packet, otherwise ignore
+       the current packet and keep decoding frames from the libspeex buffer. */
+    if (speex_bits_remaining(&s->bits) < 43) {
+        /* check for flush packet */
+        if (!buf || !buf_size) {
+            *data_size = 0;
+            return buf_size;
+        }
+        /* set new buffer */
+        speex_bits_read_from(&s->bits, buf, buf_size);
+        consumed = buf_size;
+    }
 
-    for (i = 0; speex_bits_remaining(&s->bits) && output + num_samples < end; i++) {
-        int ret = speex_decode_int(s->dec_state, &s->bits, output);
+    /* decode a single frame */
+        ret = speex_decode_int(s->dec_state, &s->bits, output);
         if (ret <= -2) {
             av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
             return -1;
-        } else if (ret == -1)
-            // end of stream
-            break;
-
+        }
         if (avctx->channels == 2)
             speex_decode_stereo_int(output, s->frame_size, &s->stereo);
 
-        output += num_samples;
-    }
-
-    avctx->frame_size = s->frame_size * i;
-    *data_size = avctx->channels * avctx->frame_size * sizeof(*output);
-    return buf_size;
+    *data_size = out_size;
+    return consumed;
 }
 
 static av_cold int libspeex_decode_close(AVCodecContext *avctx)
@@ -138,6 +148,12 @@ static av_cold int libspeex_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static av_cold void libspeex_decode_flush(AVCodecContext *avctx)
+{
+    LibSpeexContext *s = avctx->priv_data;
+    speex_bits_reset(&s->bits);
+}
+
 AVCodec ff_libspeex_decoder = {
     .name           = "libspeex",
     .type           = AVMEDIA_TYPE_AUDIO,
@@ -146,5 +162,7 @@ AVCodec ff_libspeex_decoder = {
     .init           = libspeex_decode_init,
     .close          = libspeex_decode_close,
     .decode         = libspeex_decode_frame,
+    .flush          = libspeex_decode_flush,
+    .capabilities   = CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY,
     .long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"),
 };

From 14bc60dbaeb10cb95bd47902067984de88e0315e Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 21 Oct 2011 12:10:35 -0400
Subject: [PATCH 32/35] libspeexdec: cosmetics: reindent

---
 libavcodec/libspeexdec.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index fc90308065..69742297f0 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -125,13 +125,13 @@ static int libspeex_decode_frame(AVCodecContext *avctx,
     }
 
     /* decode a single frame */
-        ret = speex_decode_int(s->dec_state, &s->bits, output);
-        if (ret <= -2) {
-            av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
-            return -1;
-        }
-        if (avctx->channels == 2)
-            speex_decode_stereo_int(output, s->frame_size, &s->stereo);
+    ret = speex_decode_int(s->dec_state, &s->bits, output);
+    if (ret <= -2) {
+        av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
+        return -1;
+    }
+    if (avctx->channels == 2)
+        speex_decode_stereo_int(output, s->frame_size, &s->stereo);
 
     *data_size = out_size;
     return consumed;

From a470fe80ba21513c29e319d968f87f1379a97d16 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 21 Oct 2011 12:13:04 -0400
Subject: [PATCH 33/35] libspeexdec: return meaningful error codes

---
 libavcodec/libspeexdec.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index 69742297f0..f66331ea93 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -60,14 +60,14 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
         mode = speex_lib_get_mode(s->header->mode);
         if (!mode) {
             av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", s->header->mode);
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     } else
         av_log(avctx, AV_LOG_INFO, "Missing Speex header, assuming defaults.\n");
 
     if (avctx->channels > 2) {
         av_log(avctx, AV_LOG_ERROR, "Only stereo and mono are supported.\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     speex_bits_init(&s->bits);
@@ -128,7 +128,7 @@ static int libspeex_decode_frame(AVCodecContext *avctx,
     ret = speex_decode_int(s->dec_state, &s->bits, output);
     if (ret <= -2) {
         av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     if (avctx->channels == 2)
         speex_decode_stereo_int(output, s->frame_size, &s->stereo);

From b19e0c2b4e74349d3b362e48c57eb233f1880b28 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Fri, 21 Oct 2011 12:13:28 -0400
Subject: [PATCH 34/35] libspeexdec: include system headers before local
 headers

---
 libavcodec/libspeexdec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index f66331ea93..8bbae6c4f3 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -18,11 +18,11 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "avcodec.h"
 #include <speex/speex.h>
 #include <speex/speex_header.h>
 #include <speex/speex_stereo.h>
 #include <speex/speex_callbacks.h>
+#include "avcodec.h"
 
 typedef struct {
     SpeexBits bits;

From f4b51d061f0f34e36be876b562b8abe47f4b9c1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= <Reimar.Doeffinger@gmx.de>
Date: Wed, 19 Oct 2011 18:41:02 +0200
Subject: [PATCH 35/35] flvdec: Do not call parse_keyframes_index with a NULL
 stream
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/flvdec.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavformat/flvdec.c b/libavformat/flvdec.c
index 395c8f8a57..1459850f4a 100644
--- a/libavformat/flvdec.c
+++ b/libavformat/flvdec.c
@@ -239,8 +239,9 @@ static int amf_parse_object(AVFormatContext *s, AVStream *astream, AVStream *vst
         case AMF_DATA_TYPE_OBJECT: {
             unsigned int keylen;
 
-            if (key && !strcmp(KEYFRAMES_TAG, key) && depth == 1)
-                if (parse_keyframes_index(s, ioc, vstream, max_pos) < 0)
+            if ((vstream || astream) && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1)
+                if (parse_keyframes_index(s, ioc, vstream ? vstream : astream,
+                                          max_pos) < 0)
                     return -1;
 
             while(avio_tell(ioc) < max_pos - 2 && (keylen = avio_rb16(ioc))) {