vp9: add keyframe profile 2/3 support.

10 years ago · b224b165cb
parent 346ce5da19
commit b224b165cb
10 changed files with 3033 additions and 2400 deletions
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@ -515,7 +515,8 @@ OBJS-$(CONFIG_VP6_DECODER)             += vp6.o vp56.o vp56data.o vp56dsp.o \
                                          vp6dsp.o vp56rac.o
 OBJS-$(CONFIG_VP7_DECODER)             += vp8.o vp8dsp.o vp56rac.o
 OBJS-$(CONFIG_VP8_DECODER)             += vp8.o vp8dsp.o vp56rac.o
-OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9dsp.o vp56rac.o
+OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9dsp.o vp56rac.o vp9dsp_8bpp.o \
                                          vp9dsp_10bpp.o vp9dsp_12bpp.o
 OBJS-$(CONFIG_VPLAYER_DECODER)         += textdec.o ass.o
 OBJS-$(CONFIG_VQA_DECODER)             += vqavideo.o
 OBJS-$(CONFIG_WAVPACK_DECODER)         += wavpack.o
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@ -109,6 +109,7 @@ typedef struct VP9Context {
    // bitstream header
    uint8_t keyframe, last_keyframe;
    uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
    uint8_t invisible;
    uint8_t use_last_frame_mvs;
    uint8_t errorres;
@ -241,15 +242,15 @@ typedef struct VP9Context {
    // whole-frame cache
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;
-    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135*144];
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
    // block reconstruction intermediates
    int block_alloc_using_2pass;
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
    uint16_t mvscale[3][2];
    uint8_t mvstep[3][2];
 } VP9Context;
@ -311,6 +312,7 @@ static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt
 {
    VP9Context *s = ctx->priv_data;
    uint8_t *p;
    int bytesperpixel = s->bytesperpixel;
    av_assert0(w > 0 && h > 0);
@ -329,12 +331,13 @@ static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt
    av_freep(&s->intra_pred_data[0]);
    // FIXME we slightly over-allocate here for subsampled chroma, but a little
    // bit of padding shouldn't affect performance...
-    p = av_malloc(s->sb_cols * (320 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+    p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
                                sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
    if (!p)
        return AVERROR(ENOMEM);
-    assign(s->intra_pred_data[0],  uint8_t *,             64);
+    assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
-    assign(s->intra_pred_data[1],  uint8_t *,             64);
+    assign(s->intra_pred_data[1],  uint8_t *,             64 * bytesperpixel);
-    assign(s->intra_pred_data[2],  uint8_t *,             64);
+    assign(s->intra_pred_data[2],  uint8_t *,             64 * bytesperpixel);
    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
    assign(s->above_mode_ctx,      uint8_t *,             16);
    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
@ -355,13 +358,19 @@ static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt
    av_freep(&s->b_base);
    av_freep(&s->block_base);
    if (s->bpp != s->last_bpp) {
        ff_vp9dsp_init(&s->dsp, s->bpp);
        ff_videodsp_init(&s->vdsp, s->bpp);
        s->last_bpp = s->bpp;
    }
    return 0;
 }
 static int update_block_buffers(AVCodecContext *ctx)
 {
    VP9Context *s = ctx->priv_data;
-    int chroma_blocks, chroma_eobs;
+    int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
        return 0;
@ -374,24 +383,24 @@ static int update_block_buffers(AVCodecContext *ctx)
        int sbs = s->sb_cols * s->sb_rows;
        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
-        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                    16 * 16 + 2 * chroma_eobs) * sbs);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
-        s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
+        s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
-        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks;
+        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks);
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
    } else {
        s->b_base = av_malloc(sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                   16 * 16 + 2 * chroma_eobs);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
-        s->uvblock_base[0] = s->block_base + 64 * 64;
+        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
-        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks;
+        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks);
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
        s->uveob_base[0] = s->eob_base + 16 * 16;
        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
    }
@ -480,6 +489,9 @@ static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
    enum AVPixelFormat res;
    int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
    s->bpp_index = bits;
    s->bpp = 8 + bits * 2;
    s->bytesperpixel = (7 + s->bpp) >> 3;
    ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
    if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
        static const enum AVPixelFormat pix_fmt_rgb[3] = {
@ -586,6 +598,9 @@ static int decode_frame_header(AVCodecContext *ctx,
                    return fmt;
            } else {
                s->ss_h = s->ss_v = 1;
                s->bpp = 8;
                s->bpp_index = 0;
                s->bytesperpixel = 1;
                fmt = AV_PIX_FMT_YUV420P;
                ctx->colorspace = AVCOL_SPC_BT470BG;
                ctx->color_range = AVCOL_RANGE_JPEG;
@ -765,10 +780,10 @@ static int decode_frame_header(AVCodecContext *ctx,
        quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
        qyac  = av_clip_uintp2(qyac, 8);
-        s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
+        s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
-        s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
+        s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
-        s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
+        s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
-        s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
+        s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
        sh = s->filter.level >= 32;
        if (s->segmentation.feat[i].lf_enabled) {
@ -2118,7 +2133,7 @@ static void decode_mode(AVCodecContext *ctx)
 // FIXME merge cnt/eob arguments?
 static av_always_inline int
 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
-                        int is_tx32x32, unsigned (*cnt)[6][3],
+                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
                        const int16_t *band_counts, const int16_t *qmul)
@ -2194,7 +2209,16 @@ decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                    val +=      (vp56_rac_get_prob(c, 134) << 1);
                    val +=       vp56_rac_get_prob(c, 130);
                } else {
-                    val  = 67 + (vp56_rac_get_prob(c, 254) << 13);
+                    val = 67;
                    if (!is8bitsperpixel) {
                        if (bpp == 12) {
                            val += vp56_rac_get_prob(c, 255) << 17;
                            val += vp56_rac_get_prob(c, 255) << 16;
                        }
                        val +=  (vp56_rac_get_prob(c, 255) << 15);
                        val +=  (vp56_rac_get_prob(c, 255) << 14);
                    }
                    val +=      (vp56_rac_get_prob(c, 254) << 13);
                    val +=      (vp56_rac_get_prob(c, 254) << 12);
                    val +=      (vp56_rac_get_prob(c, 254) << 11);
                    val +=      (vp56_rac_get_prob(c, 252) << 10);
@ -2211,12 +2235,19 @@ decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                }
            }
        }
 /*
  * STORE_COEF(c, i, v): store coefficient value v for position i of buffer c.
  *
  * Relies on a variable named `is8bitsperpixel` being in scope where the
  * macro is expanded:
  *   - non-zero: plain assignment, c[i] = v;
  *   - zero:     v is written with AV_WN32A at &c[i * 2], i.e. each
  *               coefficient occupies two consecutive elements of c
  *               (c is an int16_t pointer at the call sites in
  *               decode_coeffs_b_generic).
  *
  * NOTE(review): the arguments are not parenthesized in the expansion, so
  * callers should pass simple lvalue/index expressions for c and i.
  */
 #define STORE_COEF(c, i, v) do { \
    if (is8bitsperpixel) { \
        c[i] = v; \
    } else { \
        AV_WN32A(&c[i * 2], v); \
    } \
 } while (0)
        if (!--band_left)
            band_left = band_counts[++band];
        if (is_tx32x32)
-            coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
+            STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
        else
-            coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
+            STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
        tp = p[band][nnz];
    } while (++i < n_coeffs);
@ -2224,27 +2255,47 @@ decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
    return i;
 }
-static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
-                           unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
-                           uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                uint8_t (*p)[6][11], int nnz, const int16_t *scan,
-                           const int16_t (*nb)[2], const int16_t *band_counts,
+                                const int16_t (*nb)[2], const int16_t *band_counts,
-                           const int16_t *qmul)
+                                const int16_t *qmul)
 {
    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
 }
 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                                  const int16_t (*nb)[2], const int16_t *band_counts,
                                  const int16_t *qmul)
 {
-    return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
 }
-static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
-                             unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
-                             uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
-                             const int16_t (*nb)[2], const int16_t *band_counts,
+                                 const int16_t (*nb)[2], const int16_t *band_counts,
-                             const int16_t *qmul)
+                                 const int16_t *qmul)
 {
-    return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
 }
-static void decode_coeffs(AVCodecContext *ctx)
+static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                                   const int16_t (*nb)[2], const int16_t *band_counts,
                                   const int16_t *qmul)
 {
    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
 }
 static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
 {
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
@ -2272,6 +2323,7 @@ static void decode_coeffs(AVCodecContext *ctx)
    };
    const int16_t *y_band_counts = band_counts[b->tx];
    const int16_t *uv_band_counts = band_counts[b->uvtx];
    int bytesperpixel = is8bitsperpixel ? 1 : 2;
 #define MERGE(la, end, step, rd) \
    for (n = 0; n < end; n += step) \
@ -2286,7 +2338,8 @@ static void decode_coeffs(AVCodecContext *ctx)
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
-            res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
+            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
                                    (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
                                     c, e, p, a[x] + l[y], yscans[txtp], \
                                     ynbs[txtp], y_band_counts, qmul[0]); \
            a[x] = l[y] = !!res; \
@ -2355,12 +2408,13 @@ static void decode_coeffs(AVCodecContext *ctx)
        break;
    }
-#define DECODE_UV_COEF_LOOP(step, decode_coeffs_fn) \
+#define DECODE_UV_COEF_LOOP(step, v) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
-            res = decode_coeffs_fn(&s->c, s->uvblock[pl] + 16 * n, \
+            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
-                                   16 * step * step, c, e, p, a[x] + l[y], \
+                                    (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
-                                   uvscan, uvnb, uv_band_counts, qmul[1]); \
+                                     16 * step * step, c, e, p, a[x] + l[y], \
                                     uvscan, uvnb, uv_band_counts, qmul[1]); \
            a[x] = l[y] = !!res; \
            if (step >= 4) { \
                AV_WN16A(&s->uveob[pl][n], res); \
@ -2382,37 +2436,48 @@ static void decode_coeffs(AVCodecContext *ctx)
        l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
        switch (b->uvtx) {
        case TX_4X4:
-            DECODE_UV_COEF_LOOP(1, decode_coeffs_b);
+            DECODE_UV_COEF_LOOP(1,);
            break;
        case TX_8X8:
            MERGE_CTX(2, AV_RN16A);
-            DECODE_UV_COEF_LOOP(2, decode_coeffs_b);
+            DECODE_UV_COEF_LOOP(2,);
            SPLAT_CTX(2);
            break;
        case TX_16X16:
            MERGE_CTX(4, AV_RN32A);
-            DECODE_UV_COEF_LOOP(4, decode_coeffs_b);
+            DECODE_UV_COEF_LOOP(4,);
            SPLAT_CTX(4);
            break;
        case TX_32X32:
            MERGE_CTX(8, AV_RN64A);
-            DECODE_UV_COEF_LOOP(8, decode_coeffs_b32);
+            DECODE_UV_COEF_LOOP(8, 32);
            SPLAT_CTX(8);
            break;
        }
    }
 }
 /*
  * 8-bits-per-pixel entry point for coefficient decoding: forwards to the
  * always-inline decode_coeffs() with is8bitsperpixel = 1, so that the
  * generic body can be specialized for this constant.
  * decode_b() calls this variant when s->bytesperpixel == 1.
  */
 static void decode_coeffs_8bpp(AVCodecContext *ctx)
 {
    decode_coeffs(ctx, 1);
 }
 /*
  * High-bit-depth entry point for coefficient decoding: forwards to the
  * always-inline decode_coeffs() with is8bitsperpixel = 0 (decode_coeffs()
  * then uses bytesperpixel = 2 for its coefficient-buffer offsets).
  * decode_b() calls this variant when s->bytesperpixel != 1.
  */
 static void decode_coeffs_16bpp(AVCodecContext *ctx)
 {
    decode_coeffs(ctx, 0);
 }
 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
                                             uint8_t *l, int col, int x, int w,
                                             int row, int y, enum TxfmMode tx,
-                                             int p, int ss_h, int ss_v)
+                                             int p, int ss_h, int ss_v, int bytesperpixel)
 {
    int have_top = row > 0 || y > 0;
    int have_left = col > s->tiling.tile_col_start || x > 0;
    int have_right = x < w - 1;
    int bpp = s->bpp;
    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
                                   { DC_127_PRED,          VERT_PRED } },
@ -2474,11 +2539,11 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
        // post-loopfilter data)
        if (have_top) {
            top = !(row & 7) && !y ?
-                s->intra_pred_data[p] + col * (8 >> ss_h) + x * 4 :
+                s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
            if (have_left)
                topleft = !(row & 7) && !y ?
-                    s->intra_pred_data[p] + col * (8 >> ss_h) + x * 4 :
+                    s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
                    &dst_inner[-stride_inner];
        }
@ -2491,28 +2556,61 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
        } else {
            if (have_top) {
                if (n_px_need <= n_px_have) {
-                    memcpy(*a, top, n_px_need);
+                    memcpy(*a, top, n_px_need * bytesperpixel);
                } else {
-                    memcpy(*a, top, n_px_have);
+#define memset_bpp(c, i1, v, i2, num) do { \
-                    memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
+    if (bytesperpixel == 1) { \
-                           n_px_need - n_px_have);
+        memset(&(c)[(i1)], (v)[(i2)], (num)); \
    } else { \
        int n, val = AV_RN16A(&(v)[(i2) * 2]); \
        for (n = 0; n < (num); n++) { \
            AV_WN16A(&(c)[((i1) + n) * 2], val); \
        } \
    } \
 } while (0)
                    memcpy(*a, top, n_px_have * bytesperpixel);
                    memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
                }
            } else {
-                memset(*a, 127, n_px_need);
+#define memset_val(c, val, num) do { \
    if (bytesperpixel == 1) { \
        memset((c), (val), (num)); \
    } else { \
        int n; \
        for (n = 0; n < (num); n++) { \
            AV_WN16A(&(c)[n * 2], (val)); \
        } \
    } \
 } while (0)
                memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
            }
            if (edges[mode].needs_topleft) {
                if (have_left && have_top) {
-                    (*a)[-1] = topleft[-1];
+#define assign_bpp(c, i1, v, i2) do { \
    if (bytesperpixel == 1) { \
        (c)[(i1)] = (v)[(i2)]; \
    } else { \
        AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
    } \
 } while (0)
                    assign_bpp(*a, -1, topleft, -1);
                } else {
-                    (*a)[-1] = have_top ? 129 : 127;
+#define assign_val(c, i, v) do { \
    if (bytesperpixel == 1) { \
        (c)[(i)] = (v); \
    } else { \
        AV_WN16A(&(c)[(i) * 2], (v)); \
    } \
 } while (0)
                    assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
                }
            }
            if (tx == TX_4X4 && edges[mode].needs_topright) {
                if (have_top && have_right &&
                    n_px_need + n_px_need_tr <= n_px_have) {
-                    memcpy(&(*a)[4], &top[4], 4);
+                    memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
                } else {
-                    memset(&(*a)[4], (*a)[3], 4);
+                    memset_bpp(*a, 4, *a, 3, 4);
                }
            }
        }
@ -2526,31 +2624,32 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
            if (edges[mode].invert_left) {
                if (n_px_need <= n_px_have) {
                    for (i = 0; i < n_px_need; i++)
-                        l[i] = dst[i * stride - 1];
+                        assign_bpp(l, i, &dst[i * stride], -1);
                } else {
                    for (i = 0; i < n_px_have; i++)
-                        l[i] = dst[i * stride - 1];
+                        assign_bpp(l, i, &dst[i * stride], -1);
-                    memset(&l[n_px_have], l[n_px_have - 1], n_px_need - n_px_have);
+                    memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
                }
            } else {
                if (n_px_need <= n_px_have) {
                    for (i = 0; i < n_px_need; i++)
-                        l[n_px_need - 1 - i] = dst[i * stride - 1];
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
                } else {
                    for (i = 0; i < n_px_have; i++)
-                        l[n_px_need - 1 - i] = dst[i * stride - 1];
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
-                    memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
+                    memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
                }
            }
        } else {
-            memset(l, 129, 4 << tx);
+            memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
        }
    }
    return mode;
 }
-static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
+static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
                                         ptrdiff_t uv_off, int bytesperpixel)
 {
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
@ -2562,13 +2661,13 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
    int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
    int uvstep1d = 1 << b->uvtx, p;
    uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
-    LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
+    LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
-    LOCAL_ALIGNED_32(uint8_t, l, [32]);
+    LOCAL_ALIGNED_32(uint8_t, l, [64]);
    for (n = 0, y = 0; y < end_y; y += step1d) {
        uint8_t *ptr = dst, *ptr_r = dst_r;
-        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
+        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
-                               ptr_r += 4 * step1d, n += step) {
+                               ptr_r += 4 * step1d * bytesperpixel, n += step) {
            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
                               y * 2 + x : 0];
            uint8_t *a = &a_buf[32];
@ -2578,11 +2677,11 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
            mode = check_intra_mode(s, mode, &a, ptr_r,
                                    s->frames[CUR_FRAME].tf.f->linesize[0],
                                    ptr, s->y_stride, l,
-                                    col, x, w4, row, y, b->tx, 0, 0, 0);
+                                    col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
            if (eob)
                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
-                                           s->block + 16 * n, eob);
+                                           s->block + 16 * n * bytesperpixel, eob);
        }
        dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
        dst   += 4 * step1d * s->y_stride;
@ -2598,8 +2697,8 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
        dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
            uint8_t *ptr = dst, *ptr_r = dst_r;
-            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
+            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
-                                   ptr_r += 4 * uvstep1d, n += step) {
+                                   ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
                int mode = b->uvmode;
                uint8_t *a = &a_buf[32];
                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
@ -2607,11 +2706,11 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
                mode = check_intra_mode(s, mode, &a, ptr_r,
                                        s->frames[CUR_FRAME].tf.f->linesize[1],
                                        ptr, s->uv_stride, l, col, x, w4, row, y,
-                                        b->uvtx, p + 1, s->ss_h, s->ss_v);
+                                        b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
                if (eob)
                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
-                                                    s->uvblock[p] + 16 * n, eob);
+                                                    s->uvblock[p] + 16 * n * bytesperpixel, eob);
            }
            dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
            dst   += 4 * uvstep1d * s->uv_stride;
@ -2619,6 +2718,16 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
    }
 }
 /*
  * Intra reconstruction specialized for one byte per pixel: forwards
  * ctx, y_off and uv_off unchanged to the always-inline intra_recon()
  * with bytesperpixel = 1.
  * decode_b() selects this variant when s->bpp is not greater than 8.
  */
 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
 {
    intra_recon(ctx, y_off, uv_off, 1);
 }
 /*
  * Intra reconstruction specialized for two bytes per pixel: forwards
  * ctx, y_off and uv_off unchanged to the always-inline intra_recon()
  * with bytesperpixel = 2.
  * decode_b() selects this variant when s->bpp > 8.
  */
 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
 {
    intra_recon(ctx, y_off, uv_off, 2);
 }
 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
                                            uint8_t *dst, ptrdiff_t dst_stride,
                                            const uint8_t *ref, ptrdiff_t ref_stride,
@ -2996,6 +3105,7 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    enum BlockSize bs = bl * 3 + bp;
    int bytesperpixel = s->bytesperpixel;
    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
    int emu[2];
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
@ -3017,7 +3127,11 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
                           (s->ss_v && h4 * 2 == (1 << b->tx)));
        if (!b->skip) {
-            decode_coeffs(ctx);
+            if (bytesperpixel == 1) {
                decode_coeffs_8bpp(ctx);
            } else {
                decode_coeffs_16bpp(ctx);
            }
        } else {
            int row7 = s->row7;
@ -3056,9 +3170,9 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
        }
        if (s->pass == 1) {
            s->b++;
-            s->block += w4 * h4 * 64;
+            s->block += w4 * h4 * 64 * bytesperpixel;
-            s->uvblock[0] += w4 * h4 * 64 >> (s->ss_h + s->ss_v);
+            s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
-            s->uvblock[1] += w4 * h4 * 64 >> (s->ss_h + s->ss_v);
+            s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
            s->eob += 4 * w4 * h4;
            s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
            s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
@ -3076,7 +3190,7 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
             (row + h4) > s->rows;
    if (emu[0]) {
        s->dst[0] = s->tmp_y;
-        s->y_stride = 64;
+        s->y_stride = 128;
    } else {
        s->dst[0] = f->data[0] + yoff;
        s->y_stride = f->linesize[0];
@ -3084,14 +3198,18 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
    if (emu[1]) {
        s->dst[1] = s->tmp_uv[0];
        s->dst[2] = s->tmp_uv[1];
-        s->uv_stride = 32;
+        s->uv_stride = 128;
    } else {
        s->dst[1] = f->data[1] + uvoff;
        s->dst[2] = f->data[2] + uvoff;
        s->uv_stride = f->linesize[1];
    }
    if (b->intra) {
-        intra_recon(ctx, yoff, uvoff);
+        if (s->bpp > 8) {
            intra_recon_16bpp(ctx, yoff, uvoff);
        } else {
            intra_recon_8bpp(ctx, yoff, uvoff);
        }
    } else {
        inter_recon(ctx);
    }
@ -3104,13 +3222,14 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
-                                         s->tmp_y + o, 64, h, 0, 0);
+                                         s->tmp_y + o, 128, h, 0, 0);
-                o += bw;
+                o += bw * bytesperpixel;
            }
        }
    }
    if (emu[1]) {
-        int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
+        int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
        int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
        for (n = 1; o < w; n++) {
            int bw = 64 >> n;
@ -3118,10 +3237,10 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
-                                         s->tmp_uv[0] + o, 32, h, 0, 0);
+                                         s->tmp_uv[0] + o, 128, h, 0, 0);
                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
-                                         s->tmp_uv[1] + o, 32, h, 0, 0);
+                                         s->tmp_uv[1] + o, 128, h, 0, 0);
-                o += bw;
+                o += bw * bytesperpixel;
            }
        }
    }
@ -3158,9 +3277,9 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
    if (s->pass == 2) {
        s->b++;
-        s->block += w4 * h4 * 64;
+        s->block += w4 * h4 * 64 * bytesperpixel;
-        s->uvblock[0] += w4 * h4 * 64 >> (s->ss_v + s->ss_h);
+        s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
-        s->uvblock[1] += w4 * h4 * 64 >> (s->ss_v + s->ss_h);
+        s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
        s->eob += 4 * w4 * h4;
        s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
        s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
@ -3179,6 +3298,7 @@ static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *l
    ptrdiff_t hbs = 4 >> bl;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
    int bytesperpixel = s->bytesperpixel;
    if (bl == BL_8X8) {
        bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
@ -3198,19 +3318,21 @@ static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *l
                break;
            case PARTITION_V:
                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
-                yoff  += hbs * 8;
+                yoff  += hbs * 8 * bytesperpixel;
-                uvoff += hbs * 8 >> s->ss_h;
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
                decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_SPLIT:
                decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(ctx, row, col + hbs, lflvl,
-                          yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
+                          yoff + 8 * hbs * bytesperpixel,
                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(ctx, row + hbs, col + hbs, lflvl,
-                          yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
+                          yoff + 8 * hbs * bytesperpixel,
                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                break;
            default:
                av_assert0(0);
@ -3219,7 +3341,8 @@ static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *l
            bp = PARTITION_SPLIT;
            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
            decode_sb(ctx, row, col + hbs, lflvl,
-                      yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
+                      yoff + 8 * hbs * bytesperpixel,
                      uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
        } else {
            bp = PARTITION_H;
            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
@ -3250,6 +3373,7 @@ static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filte
    ptrdiff_t hbs = 4 >> bl;
    AVFrame *f = s->frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
    int bytesperpixel = s->bytesperpixel;
    if (bl == BL_8X8) {
        av_assert2(b->bl == BL_8X8);
@ -3261,24 +3385,25 @@ static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filte
            uvoff += hbs * 8 * uv_stride >> s->ss_v;
            decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
        } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
-            yoff  += hbs * 8;
+            yoff  += hbs * 8 * bytesperpixel;
-            uvoff += hbs * 8 >> s->ss_h;
+            uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
            decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
        }
    } else {
        decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
        if (col + hbs < s->cols) { // FIXME why not <=?
            if (row + hbs < s->rows) {
-                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
+                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
-                              uvoff + (8 * hbs >> s->ss_h), bl + 1);
+                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
-                                    yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
+                              yoff + 8 * hbs * bytesperpixel,
                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
            } else {
-                yoff  += hbs * 8;
+                yoff  += hbs * 8 * bytesperpixel;
-                uvoff += hbs * 8 >> s->ss_h;
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
            }
        } else if (row + hbs < s->rows) {
@ -3293,7 +3418,7 @@ static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h,
                                               uint8_t *lvl, uint8_t (*mask)[4],
                                               uint8_t *dst, ptrdiff_t ls)
 {
-    int y, x;
+    int y, x, bytesperpixel = s->bytesperpixel;
    // filter edges between columns (e.g. block1 | block2)
    for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
@ -3302,7 +3427,7 @@ static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h,
        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
        unsigned hm = hm1 | hm2 | hm13 | hm23;
-        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 >> ss_h) {
+        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
            if (col || x > 1) {
                if (hm1 & x) {
                    int L = *l, H = L >> 4;
@ -3348,15 +3473,15 @@ static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h,
                        H |= (L >> 4) << 8;
                        E |= s->filter.mblim_lut[L] << 8;
                        I |= s->filter.lim_lut[L] << 8;
-                        s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls, E, I, H);
+                        s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
                    } else {
-                        s->dsp.loop_filter_8[0][0](ptr + 4, ls, E, I, H);
+                        s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
                    }
                } else if (hm23 & x) {
                    int L = l[8 << ss_v], H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-                    s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4, ls, E, I, H);
+                    s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
                }
                l++;
            }
@ -3368,7 +3493,7 @@ static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h,
                                               uint8_t *lvl, uint8_t (*mask)[4],
                                               uint8_t *dst, ptrdiff_t ls)
 {
-    int y, x;
+    int y, x, bytesperpixel = s->bytesperpixel;
    //                                 block1
    // filter edges between rows (e.g. ------)
@ -3377,7 +3502,7 @@ static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h,
        uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
-        for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16, l += 2 << ss_h) {
+        for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
            if (row || y) {
                if (vm & x) {
                    int L = *l, H = L >> 4;
@ -3407,7 +3532,7 @@ static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h,
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
                    s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
-                                        [1](ptr + 8, ls, E, I, H);
+                                        [1](ptr + 8 * bytesperpixel, ls, E, I, H);
                }
            }
            if (!ss_v) {
@ -3428,7 +3553,7 @@ static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h,
                    int L = l[1 + ss_h], H = L >> 4;
                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-                    s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8, ls, E, I, H);
+                    s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
                }
            }
        }
@ -3758,6 +3883,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
    int retain_segmap_ref = s->segmentation.enabled && !s->segmentation.update_map;
    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
    AVFrame *f;
    int bytesperpixel;
    if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
        return res;
@ -3819,6 +3945,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
    }
    // main tile decode loop
    bytesperpixel = s->bytesperpixel;
    memset(s->above_partition_ctx, 0, s->cols);
    memset(s->above_skip_ctx, 0, s->cols);
    if (s->keyframe || s->intraonly) {
@ -3921,7 +4048,8 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                    for (col = s->tiling.tile_col_start;
                         col < s->tiling.tile_col_end;
-                         col += 8, yoff2 += 64, uvoff2 += 64 >> s->ss_h, lflvl_ptr++) {
+                         col += 8, yoff2 += 64 * bytesperpixel,
                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                        // FIXME integrate with lf code (i.e. zero after each
                        // use, similar to invtxfm coefficients, or similar)
                        if (s->pass != 1) {
@ -3950,13 +4078,13 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                if (row + 8 < s->rows) {
                    memcpy(s->intra_pred_data[0],
                           f->data[0] + yoff + 63 * ls_y,
-                           8 * s->cols);
+                           8 * s->cols * bytesperpixel);
                    memcpy(s->intra_pred_data[1],
                           f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
-                           8 * s->cols >> s->ss_h);
+                           8 * s->cols * bytesperpixel >> s->ss_h);
                    memcpy(s->intra_pred_data[2],
                           f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
-                           8 * s->cols >> s->ss_h);
+                           8 * s->cols * bytesperpixel >> s->ss_h);
                }
                // loopfilter one row
@ -3965,7 +4093,8 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                    uvoff2 = uvoff;
                    lflvl_ptr = s->lflvl;
                    for (col = 0; col < s->cols;
-                         col += 8, yoff2 += 64, uvoff2 += 64 >> s->ss_h, lflvl_ptr++) {
+                         col += 8, yoff2 += 64 * bytesperpixel,
                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                        loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
                    }
                }
@ -4042,8 +4171,7 @@ static av_cold int vp9_decode_init(AVCodecContext *ctx)
    VP9Context *s = ctx->priv_data;
    ctx->internal->allocate_progress = 1;
-    ff_vp9dsp_init(&s->dsp);
+    s->last_bpp = 0;
    ff_videodsp_init(&s->vdsp, 8);
    s->filter.sharpness = -1;
    return init_frames(ctx);
--- a/libavcodec/vp9data.h
+++ b/libavcodec/vp9data.h
@ -236,74 +236,210 @@ static const enum FilterMode vp9_filter_lut[3] = {
    FILTER_8TAP_SHARP,
 };
-static const int16_t vp9_dc_qlookup[256] = {
+static const int16_t vp9_dc_qlookup[3][256] = {
-       4,    8,    8,    9,   10,   11,   12,   12,
+    {
-      13,   14,   15,   16,   17,   18,   19,   19,
+            4,     8,     8,     9,    10,    11,    12,    12,
-      20,   21,   22,   23,   24,   25,   26,   26,
+           13,    14,    15,    16,    17,    18,    19,    19,
-      27,   28,   29,   30,   31,   32,   32,   33,
+           20,    21,    22,    23,    24,    25,    26,    26,
-      34,   35,   36,   37,   38,   38,   39,   40,
+           27,    28,    29,    30,    31,    32,    32,    33,
-      41,   42,   43,   43,   44,   45,   46,   47,
+           34,    35,    36,    37,    38,    38,    39,    40,
-      48,   48,   49,   50,   51,   52,   53,   53,
+           41,    42,    43,    43,    44,    45,    46,    47,
-      54,   55,   56,   57,   57,   58,   59,   60,
+           48,    48,    49,    50,    51,    52,    53,    53,
-      61,   62,   62,   63,   64,   65,   66,   66,
+           54,    55,    56,    57,    57,    58,    59,    60,
-      67,   68,   69,   70,   70,   71,   72,   73,
+           61,    62,    62,    63,    64,    65,    66,    66,
-      74,   74,   75,   76,   77,   78,   78,   79,
+           67,    68,    69,    70,    70,    71,    72,    73,
-      80,   81,   81,   82,   83,   84,   85,   85,
+           74,    74,    75,    76,    77,    78,    78,    79,
-      87,   88,   90,   92,   93,   95,   96,   98,
+           80,    81,    81,    82,    83,    84,    85,    85,
-      99,  101,  102,  104,  105,  107,  108,  110,
+           87,    88,    90,    92,    93,    95,    96,    98,
-     111,  113,  114,  116,  117,  118,  120,  121,
+           99,   101,   102,   104,   105,   107,   108,   110,
-     123,  125,  127,  129,  131,  134,  136,  138,
+          111,   113,   114,   116,   117,   118,   120,   121,
-     140,  142,  144,  146,  148,  150,  152,  154,
+          123,   125,   127,   129,   131,   134,   136,   138,
-     156,  158,  161,  164,  166,  169,  172,  174,
+          140,   142,   144,   146,   148,   150,   152,   154,
-     177,  180,  182,  185,  187,  190,  192,  195,
+          156,   158,   161,   164,   166,   169,   172,   174,
-     199,  202,  205,  208,  211,  214,  217,  220,
+          177,   180,   182,   185,   187,   190,   192,   195,
-     223,  226,  230,  233,  237,  240,  243,  247,
+          199,   202,   205,   208,   211,   214,   217,   220,
-     250,  253,  257,  261,  265,  269,  272,  276,
+          223,   226,   230,   233,   237,   240,   243,   247,
-     280,  284,  288,  292,  296,  300,  304,  309,
+          250,   253,   257,   261,   265,   269,   272,   276,
-     313,  317,  322,  326,  330,  335,  340,  344,
+          280,   284,   288,   292,   296,   300,   304,   309,
-     349,  354,  359,  364,  369,  374,  379,  384,
+          313,   317,   322,   326,   330,   335,   340,   344,
-     389,  395,  400,  406,  411,  417,  423,  429,
+          349,   354,   359,   364,   369,   374,   379,   384,
-     435,  441,  447,  454,  461,  467,  475,  482,
+          389,   395,   400,   406,   411,   417,   423,   429,
-     489,  497,  505,  513,  522,  530,  539,  549,
+          435,   441,   447,   454,   461,   467,   475,   482,
-     559,  569,  579,  590,  602,  614,  626,  640,
+          489,   497,   505,   513,   522,   530,   539,   549,
-     654,  668,  684,  700,  717,  736,  755,  775,
+          559,   569,   579,   590,   602,   614,   626,   640,
-     796,  819,  843,  869,  896,  925,  955,  988,
+          654,   668,   684,   700,   717,   736,   755,   775,
-    1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
+          796,   819,   843,   869,   896,   925,   955,   988,
         1022,  1058,  1098,  1139,  1184,  1232,  1282,  1336,
    }, {
            4,     9,    10,    13,    15,    17,    20,    22,
           25,    28,    31,    34,    37,    40,    43,    47,
           50,    53,    57,    60,    64,    68,    71,    75,
           78,    82,    86,    90,    93,    97,   101,   105,
          109,   113,   116,   120,   124,   128,   132,   136,
          140,   143,   147,   151,   155,   159,   163,   166,
          170,   174,   178,   182,   185,   189,   193,   197,
          200,   204,   208,   212,   215,   219,   223,   226,
          230,   233,   237,   241,   244,   248,   251,   255,
          259,   262,   266,   269,   273,   276,   280,   283,
          287,   290,   293,   297,   300,   304,   307,   310,
          314,   317,   321,   324,   327,   331,   334,   337,
          343,   350,   356,   362,   369,   375,   381,   387,
          394,   400,   406,   412,   418,   424,   430,   436,
          442,   448,   454,   460,   466,   472,   478,   484,
          490,   499,   507,   516,   525,   533,   542,   550,
          559,   567,   576,   584,   592,   601,   609,   617,
          625,   634,   644,   655,   666,   676,   687,   698,
          708,   718,   729,   739,   749,   759,   770,   782,
          795,   807,   819,   831,   844,   856,   868,   880,
          891,   906,   920,   933,   947,   961,   975,   988,
         1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
         1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
         1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
         1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
         1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
         1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
         1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
         2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
         2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
         3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
         4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
    }, {
            4,    12,    18,    25,    33,    41,    50,    60,
           70,    80,    91,   103,   115,   127,   140,   153,
          166,   180,   194,   208,   222,   237,   251,   266,
          281,   296,   312,   327,   343,   358,   374,   390,
          405,   421,   437,   453,   469,   484,   500,   516,
          532,   548,   564,   580,   596,   611,   627,   643,
          659,   674,   690,   706,   721,   737,   752,   768,
          783,   798,   814,   829,   844,   859,   874,   889,
          904,   919,   934,   949,   964,   978,   993,  1008,
         1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
         1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
         1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
         1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
         1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
         1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
         1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
         2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
         2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
         2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
         3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
         3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
         4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
         4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
         5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
         5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
         6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
         6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
         7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
         8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
        10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
        12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
        16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
    }
 };
-static const int16_t vp9_ac_qlookup[256] = {
+static const int16_t vp9_ac_qlookup[3][256] = {
-       4,    8,    9,   10,   11,   12,   13,   14,
+    {
-      15,   16,   17,   18,   19,   20,   21,   22,
+            4,     8,     9,    10,    11,    12,    13,    14,
-      23,   24,   25,   26,   27,   28,   29,   30,
+           15,    16,    17,    18,    19,    20,    21,    22,
-      31,   32,   33,   34,   35,   36,   37,   38,
+           23,    24,    25,    26,    27,    28,    29,    30,
-      39,   40,   41,   42,   43,   44,   45,   46,
+           31,    32,    33,    34,    35,    36,    37,    38,
-      47,   48,   49,   50,   51,   52,   53,   54,
+           39,    40,    41,    42,    43,    44,    45,    46,
-      55,   56,   57,   58,   59,   60,   61,   62,
+           47,    48,    49,    50,    51,    52,    53,    54,
-      63,   64,   65,   66,   67,   68,   69,   70,
+           55,    56,    57,    58,    59,    60,    61,    62,
-      71,   72,   73,   74,   75,   76,   77,   78,
+           63,    64,    65,    66,    67,    68,    69,    70,
-      79,   80,   81,   82,   83,   84,   85,   86,
+           71,    72,    73,    74,    75,    76,    77,    78,
-      87,   88,   89,   90,   91,   92,   93,   94,
+           79,    80,    81,    82,    83,    84,    85,    86,
-      95,   96,   97,   98,   99,  100,  101,  102,
+           87,    88,    89,    90,    91,    92,    93,    94,
-     104,  106,  108,  110,  112,  114,  116,  118,
+           95,    96,    97,    98,    99,   100,   101,   102,
-     120,  122,  124,  126,  128,  130,  132,  134,
+          104,   106,   108,   110,   112,   114,   116,   118,
-     136,  138,  140,  142,  144,  146,  148,  150,
+          120,   122,   124,   126,   128,   130,   132,   134,
-     152,  155,  158,  161,  164,  167,  170,  173,
+          136,   138,   140,   142,   144,   146,   148,   150,
-     176,  179,  182,  185,  188,  191,  194,  197,
+          152,   155,   158,   161,   164,   167,   170,   173,
-     200,  203,  207,  211,  215,  219,  223,  227,
+          176,   179,   182,   185,   188,   191,   194,   197,
-     231,  235,  239,  243,  247,  251,  255,  260,
+          200,   203,   207,   211,   215,   219,   223,   227,
-     265,  270,  275,  280,  285,  290,  295,  300,
+          231,   235,   239,   243,   247,   251,   255,   260,
-     305,  311,  317,  323,  329,  335,  341,  347,
+          265,   270,   275,   280,   285,   290,   295,   300,
-     353,  359,  366,  373,  380,  387,  394,  401,
+          305,   311,   317,   323,   329,   335,   341,   347,
-     408,  416,  424,  432,  440,  448,  456,  465,
+          353,   359,   366,   373,   380,   387,   394,   401,
-     474,  483,  492,  501,  510,  520,  530,  540,
+          408,   416,   424,   432,   440,   448,   456,   465,
-     550,  560,  571,  582,  593,  604,  615,  627,
+          474,   483,   492,   501,   510,   520,   530,   540,
-     639,  651,  663,  676,  689,  702,  715,  729,
+          550,   560,   571,   582,   593,   604,   615,   627,
-     743,  757,  771,  786,  801,  816,  832,  848,
+          639,   651,   663,   676,   689,   702,   715,   729,
-     864,  881,  898,  915,  933,  951,  969,  988,
+          743,   757,   771,   786,   801,   816,   832,   848,
-    1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
+          864,   881,   898,   915,   933,   951,   969,   988,
-    1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+         1007,  1026,  1046,  1066,  1087,  1108,  1129,  1151,
-    1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
+         1173,  1196,  1219,  1243,  1267,  1292,  1317,  1343,
-    1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+         1369,  1396,  1423,  1451,  1479,  1508,  1537,  1567,
         1597,  1628,  1660,  1692,  1725,  1759,  1793,  1828,
    }, {
            4,     9,    11,    13,    16,    18,    21,    24,
           27,    30,    33,    37,    40,    44,    48,    51,
           55,    59,    63,    67,    71,    75,    79,    83,
           88,    92,    96,   100,   105,   109,   114,   118,
          122,   127,   131,   136,   140,   145,   149,   154,
          158,   163,   168,   172,   177,   181,   186,   190,
          195,   199,   204,   208,   213,   217,   222,   226,
          231,   235,   240,   244,   249,   253,   258,   262,
          267,   271,   275,   280,   284,   289,   293,   297,
          302,   306,   311,   315,   319,   324,   328,   332,
          337,   341,   345,   349,   354,   358,   362,   367,
          371,   375,   379,   384,   388,   392,   396,   401,
          409,   417,   425,   433,   441,   449,   458,   466,
          474,   482,   490,   498,   506,   514,   523,   531,
          539,   547,   555,   563,   571,   579,   588,   596,
          604,   616,   628,   640,   652,   664,   676,   688,
          700,   713,   725,   737,   749,   761,   773,   785,
          797,   809,   825,   841,   857,   873,   889,   905,
          922,   938,   954,   970,   986,  1002,  1018,  1038,
         1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
         1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
         1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
         1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
         1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
         2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
         2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
         2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
         3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
         4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
         4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
         5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
         6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
    }, {
            4,    13,    19,    27,    35,    44,    54,    64,
           75,    87,    99,   112,   126,   139,   154,   168,
          183,   199,   214,   230,   247,   263,   280,   297,
          314,   331,   349,   366,   384,   402,   420,   438,
          456,   475,   493,   511,   530,   548,   567,   586,
          604,   623,   642,   660,   679,   698,   716,   735,
          753,   772,   791,   809,   828,   846,   865,   884,
          902,   920,   939,   957,   976,   994,  1012,  1030,
         1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
         1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
         1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
         1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
         1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
         1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
         2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
         2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
         2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
         3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
         3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
         4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
         4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
         5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
         6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
         7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
         8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
        10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
        11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
        13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
        16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
        18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
        21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
        25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
    }
 };
 static const enum TxfmType vp9_intra_txfm_type[14] = {
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
--- a/libavcodec/vp9dsp.h
+++ b/libavcodec/vp9dsp.h
@ -120,8 +120,12 @@ typedef struct VP9DSPContext {
    vp9_scaled_mc_func smc[5][4][2];
 } VP9DSPContext;
-void ff_vp9dsp_init(VP9DSPContext *dsp);
+void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp);
-void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
+void ff_vp9dsp_init_8(VP9DSPContext *dsp);
 void ff_vp9dsp_init_10(VP9DSPContext *dsp);
 void ff_vp9dsp_init_12(VP9DSPContext *dsp);
 void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp);
 #endif /* AVCODEC_VP9DSP_H */
--- a/libavcodec/vp9dsp_10bpp.c
+++ b/libavcodec/vp9dsp_10bpp.c
@ -0,0 +1,26 @@
 /*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /*
  * Sample bit depth for this translation unit; it is defined before the
  * shared template is included below.
  * NOTE(review): vp9dsp_template.c presumably reads BIT_DEPTH to choose the
  * pixel type and to name the generated functions -- confirm in the template.
  */
 #define BIT_DEPTH 10
 /*
  * Integer type bound to the name dctint for this unit: int64_t here,
  * whereas the 8 bpp unit binds it to plain int.
  * NOTE(review): likely widened so transform intermediates cannot overflow
  * at this depth -- confirm against the template's use of dctint.
  */
 #define dctint int64_t
 /* Pull in the shared implementation with the two macros above in effect. */
 #include "vp9dsp_template.c"
--- a/libavcodec/vp9dsp_12bpp.c
+++ b/libavcodec/vp9dsp_12bpp.c
@ -0,0 +1,26 @@
 /*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /*
  * Sample bit depth for this translation unit; it is defined before the
  * shared template is included below.
  * NOTE(review): vp9dsp_template.c presumably reads BIT_DEPTH to choose the
  * pixel type and to name the generated functions -- confirm in the template.
  */
 #define BIT_DEPTH 12
 /*
  * Integer type bound to the name dctint for this unit: int64_t here,
  * whereas the 8 bpp unit binds it to plain int.
  * NOTE(review): likely widened so transform intermediates cannot overflow
  * at this depth -- confirm against the template's use of dctint.
  */
 #define dctint int64_t
 /* Pull in the shared implementation with the two macros above in effect. */
 #include "vp9dsp_template.c"
--- a/libavcodec/vp9dsp_8bpp.c
+++ b/libavcodec/vp9dsp_8bpp.c
@ -0,0 +1,26 @@
 /*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /*
  * Sample bit depth for this translation unit; it is defined before the
  * shared template is included below.
  * NOTE(review): vp9dsp_template.c presumably reads BIT_DEPTH to choose the
  * pixel type and to name the generated functions -- confirm in the template.
  */
 #define BIT_DEPTH 8
 /*
  * Integer type bound to the name dctint for this unit: plain int here,
  * whereas the 10 and 12 bpp units bind it to int64_t.
  * NOTE(review): presumably int is wide enough for transform intermediates
  * at 8 bits per sample -- confirm against the template's use of dctint.
  */
 #define dctint int
 /* Pull in the shared implementation with the two macros above in effect. */
 #include "vp9dsp_template.c"
--- a/libavcodec/vp9dsp_template.c
+++ b/libavcodec/vp9dsp_template.c
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@ -307,8 +307,10 @@ ipred_func(32, tm, avx2);
 #endif /* HAVE_YASM */
-av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp)
 {
    if (bpp != 8) return;
 #if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();