From 8305041e137f4f2a49669dd588bf6ccfbbac2b58 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 19 Oct 2011 19:56:56 -0700 Subject: [PATCH 01/35] swscale: prevent overflow in coefficient calculation. --- libswscale/utils.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 8e5daf99dc..ea5a1ab468 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -271,19 +271,20 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi floatd= d * (1.0/(1<<30)); if (flags & SWS_BICUBIC) { -#define SQRT_INT64_MAX 0xb504f333 int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1<<24); int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24); - int64_t dd = d > SQRT_INT64_MAX ? ((d >> 1) * d) >> 29 : (d * d) >> 30; - int64_t ddd = d > SQRT_INT64_MAX || dd > SQRT_INT64_MAX ? - ((dd >> 2) * d) >> 28 : (dd * d) >> 30; - - if (d < 1LL<<30) - coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30); - else if (d < 1LL<<31) - coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30); - else - coeff=0.0; + + if (d >= 1LL<<31) { + coeff = 0.0; + } else { + int64_t dd = (d * d) >> 30; + int64_t ddd = (dd * d) >> 30; + + if (d < 1LL<<30) + coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30); + else + coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30); + } coeff *= fone>>(30+24); } /* else if (flags & SWS_X) { From ce42a04884cd6585c596f1ecfe737dacc3e6f396 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Mon, 17 Oct 2011 17:10:16 -0700 Subject: [PATCH 02/35] vp8: fix up handling of segmentation_maps in reference frames. Associate segmentation_map[] with reference frame, rather than decoding instance. This fixes cases where the map would be free()'ed on e.g. a size change in one thread, whereas the other thread was still accessing it. 
Also, it fixes cases where threads overwrite data that is still being referenced by the previous thread, who thinks that it's part of the frame previously decoded by the next thread. --- libavcodec/vp8.c | 68 +++++++++++++++++++++++++++++++++++++++--------- libavcodec/vp8.h | 11 +++++++- 2 files changed, 65 insertions(+), 14 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 7442b99252..d5cdaba486 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -41,24 +41,57 @@ static void free_buffers(VP8Context *s) av_freep(&s->top_nnz); av_freep(&s->edge_emu_buffer); av_freep(&s->top_border); - av_freep(&s->segmentation_map); s->macroblocks = NULL; } -static void vp8_decode_flush(AVCodecContext *avctx) +static int vp8_alloc_frame(VP8Context *s, AVFrame *f) +{ + int ret; + if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0) + return ret; + if (!s->maps_are_invalid && s->num_maps_to_be_freed) { + f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed]; + } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) { + ff_thread_release_buffer(s->avctx, f); + return AVERROR(ENOMEM); + } + return 0; +} + +static void vp8_release_frame(VP8Context *s, AVFrame *f, int is_close) +{ + if (!is_close) { + if (f->ref_index[0]) { + assert(s->num_maps_to_be_freed < FF_ARRAY_ELEMS(s->segmentation_maps)); + s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0]; + f->ref_index[0] = NULL; + } + } else { + av_freep(&f->ref_index[0]); + } + ff_thread_release_buffer(s->avctx, f); +} + +static void vp8_decode_flush_impl(AVCodecContext *avctx, int force, int is_close) { VP8Context *s = avctx->priv_data; int i; - if (!avctx->is_copy) { + if (!avctx->is_copy || force) { for (i = 0; i < 5; i++) if (s->frames[i].data[0]) - ff_thread_release_buffer(avctx, &s->frames[i]); + vp8_release_frame(s, &s->frames[i], is_close); } memset(s->framep, 0, sizeof(s->framep)); free_buffers(s); + s->maps_are_invalid = 1; +} + +static void 
vp8_decode_flush(AVCodecContext *avctx) +{ + vp8_decode_flush_impl(avctx, 0, 0); } static int update_dimensions(VP8Context *s, int width, int height) @@ -68,7 +101,7 @@ static int update_dimensions(VP8Context *s, int width, int height) if (av_image_check_size(width, height, 0, s->avctx)) return AVERROR_INVALIDDATA; - vp8_decode_flush(s->avctx); + vp8_decode_flush_impl(s->avctx, 1, 0); avcodec_set_dimensions(s->avctx, width, height); } @@ -81,10 +114,9 @@ static int update_dimensions(VP8Context *s, int width, int height) s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4); s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz)); s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border)); - s->segmentation_map = av_mallocz(s->mb_width*s->mb_height); if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top || - !s->top_nnz || !s->top_border || !s->segmentation_map) + !s->top_nnz || !s->top_border) return AVERROR(ENOMEM); s->macroblocks = s->macroblocks_base + 1; @@ -1508,6 +1540,14 @@ static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y) } } +static void release_queued_segmaps(VP8Context *s, int is_close) +{ + int leave_behind = is_close ? 
0 : !s->maps_are_invalid; + while (s->num_maps_to_be_freed > leave_behind) + av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]); + s->maps_are_invalid = 0; +} + static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { @@ -1516,6 +1556,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, enum AVDiscard skip_thresh; AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT]; + release_queued_segmaps(s, 0); + if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0) return ret; @@ -1538,7 +1580,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] && &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] && &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) - ff_thread_release_buffer(avctx, &s->frames[i]); + vp8_release_frame(s, &s->frames[i], 0); // find a free buffer for (i = 0; i < 5; i++) @@ -1559,8 +1601,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, curframe->key_frame = s->keyframe; curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P; curframe->reference = referenced ? 3 : 0; - curframe->ref_index[0] = s->segmentation_map; - if ((ret = ff_thread_get_buffer(avctx, curframe))) { + if ((ret = vp8_alloc_frame(s, curframe))) { av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n"); return ret; } @@ -1652,8 +1693,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4); s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2); - decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy, - prev_frame ? prev_frame->ref_index[0] + mb_xy : NULL); + decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy, + prev_frame && prev_frame->ref_index[0] ? 
prev_frame->ref_index[0] + mb_xy : NULL); prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS); @@ -1736,7 +1777,8 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx) static av_cold int vp8_decode_free(AVCodecContext *avctx) { - vp8_decode_flush(avctx); + vp8_decode_flush_impl(avctx, 0, 1); + release_queued_segmaps(avctx->priv_data, 1); return 0; } diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h index 5a96cd436c..6cbdca2d88 100644 --- a/libavcodec/vp8.h +++ b/libavcodec/vp8.h @@ -130,7 +130,6 @@ typedef struct { uint8_t *intra4x4_pred_mode_top; uint8_t intra4x4_pred_mode_left[4]; - uint8_t *segmentation_map; /** * Macroblocks can have one of 4 different quants in a frame when @@ -237,6 +236,16 @@ typedef struct { H264PredContext hpc; vp8_mc_func put_pixels_tab[3][3][3]; AVFrame frames[5]; + + /** + * A list of segmentation_map buffers that are to be free()'ed in + * the next decoding iteration. We can't free() them right away + * because the map may still be used by subsequent decoding threads. + * Unused if frame threading is off. + */ + uint8_t *segmentation_maps[5]; + int num_maps_to_be_freed; + int maps_are_invalid; } VP8Context; #endif /* AVCODEC_VP8_H */ From e85297e0e7396c9724e57ac949f10e5eb345bf54 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 21 Oct 2011 00:21:31 -0700 Subject: [PATCH 03/35] Revert d1d421cbc0d13b08535f7fc08d179572ee352072: change to fate-lavfi-crop_scale. --- tests/ref/lavfi/crop_scale | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ref/lavfi/crop_scale b/tests/ref/lavfi/crop_scale index ae26c8ee6d..82e5394496 100644 --- a/tests/ref/lavfi/crop_scale +++ b/tests/ref/lavfi/crop_scale @@ -1 +1 @@ -crop_scale f8cad857d2b7102fc256532ec9849da7 +crop_scale 0a3d45d58b805b8c47416b9239535f94 From dc49bf127010fdff2c3282755407cedd429475f5 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Fri, 21 Oct 2011 00:38:04 -0700 Subject: [PATCH 04/35] sws/pixfmt/pixdesc: add support for yuv420p9le/be. --- libavcodec/utils.c | 2 ++ libavutil/pixdesc.c | 23 +++++++++++++++++++++++ libavutil/pixfmt.h | 3 +++ libswscale/swscale.c | 4 ++++ libswscale/swscale_internal.h | 4 ++++ libswscale/utils.c | 2 ++ tests/ref/lavfi/pixdesc | 2 ++ tests/ref/lavfi/pixfmts_copy | 2 ++ tests/ref/lavfi/pixfmts_null | 2 ++ tests/ref/lavfi/pixfmts_scale | 2 ++ tests/ref/lavfi/pixfmts_vflip | 2 ++ 11 files changed, 48 insertions(+) diff --git a/libavcodec/utils.c b/libavcodec/utils.c index c1a5c19e04..3e4926273f 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -158,6 +158,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV420P10LE: case PIX_FMT_YUV420P10BE: + case PIX_FMT_YUV422P9LE: + case PIX_FMT_YUV422P9BE: case PIX_FMT_YUV422P10LE: case PIX_FMT_YUV422P10BE: case PIX_FMT_YUV444P9LE: diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c index c70a41347b..f7df0eba65 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c @@ -809,6 +809,29 @@ const AVPixFmtDescriptor av_pix_fmt_descriptors[PIX_FMT_NB] = { }, .flags = PIX_FMT_BE, }, + [PIX_FMT_YUV422P9LE] = { + .name = "yuv422p9le", + .nb_components= 3, + .log2_chroma_w= 1, + .log2_chroma_h= 0, + .comp = { + {0,1,1,0,8}, /* Y */ + {1,1,1,0,8}, /* U */ + {2,1,1,0,8}, /* V */ + }, + }, + [PIX_FMT_YUV422P9BE] = { + .name = "yuv422p9be", + .nb_components= 3, + .log2_chroma_w= 1, + .log2_chroma_h= 0, + .comp = { + {0,1,1,0,8}, /* Y */ + {1,1,1,0,8}, /* U */ + {2,1,1,0,8}, /* V */ + }, + .flags = PIX_FMT_BE, + }, [PIX_FMT_YUV422P10LE] = { .name = "yuv422p10le", .nb_components= 3, diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h index 9bf793866a..7068b43fed 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h @@ -145,6 +145,8 @@ enum PixelFormat { PIX_FMT_YUV444P9LE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y 
samples), little-endian PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian + PIX_FMT_YUV422P9BE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian + PIX_FMT_YUV422P9LE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; @@ -170,6 +172,7 @@ enum PixelFormat { #define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE) #define PIX_FMT_YUV420P9 PIX_FMT_NE(YUV420P9BE , YUV420P9LE) +#define PIX_FMT_YUV422P9 PIX_FMT_NE(YUV422P9BE , YUV422P9LE) #define PIX_FMT_YUV444P9 PIX_FMT_NE(YUV444P9BE , YUV444P9LE) #define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE) #define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 5d90250acf..227f65e301 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -2662,6 +2662,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c) case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break; #if HAVE_BIGENDIAN case PIX_FMT_YUV444P9LE: + case PIX_FMT_YUV422P9LE: case PIX_FMT_YUV420P9LE: case PIX_FMT_YUV422P10LE: case PIX_FMT_YUV444P10LE: @@ -2671,6 +2672,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c) case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break; #else case PIX_FMT_YUV444P9BE: + case PIX_FMT_YUV422P9BE: case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV444P10BE: case PIX_FMT_YUV422P10BE: @@ -2729,6 +2731,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c) switch (srcFormat) { #if HAVE_BIGENDIAN case PIX_FMT_YUV444P9LE: + case PIX_FMT_YUV422P9LE: case PIX_FMT_YUV420P9LE: case PIX_FMT_YUV444P10LE: case PIX_FMT_YUV422P10LE: @@ -2739,6 +2742,7 @@ static av_cold void 
sws_init_swScale_c(SwsContext *c) case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break; #else case PIX_FMT_YUV444P9BE: + case PIX_FMT_YUV422P9BE: case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV444P10BE: case PIX_FMT_YUV422P10BE: diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index a13b89d203..9b895b125b 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -535,6 +535,8 @@ const char *sws_format_name(enum PixelFormat format); #define is9_OR_10BPS(x) ( \ (x)==PIX_FMT_YUV420P9LE \ || (x)==PIX_FMT_YUV420P9BE \ + || (x)==PIX_FMT_YUV422P9LE \ + || (x)==PIX_FMT_YUV422P9BE \ || (x)==PIX_FMT_YUV444P9BE \ || (x)==PIX_FMT_YUV444P9LE \ || (x)==PIX_FMT_YUV422P10BE \ @@ -559,6 +561,7 @@ const char *sws_format_name(enum PixelFormat format); #define isPlanarYUV(x) ( \ isPlanar8YUV(x) \ || (x)==PIX_FMT_YUV420P9LE \ + || (x)==PIX_FMT_YUV422P9LE \ || (x)==PIX_FMT_YUV444P9LE \ || (x)==PIX_FMT_YUV420P10LE \ || (x)==PIX_FMT_YUV422P10LE \ @@ -567,6 +570,7 @@ const char *sws_format_name(enum PixelFormat format); || (x)==PIX_FMT_YUV422P16LE \ || (x)==PIX_FMT_YUV444P16LE \ || (x)==PIX_FMT_YUV420P9BE \ + || (x)==PIX_FMT_YUV422P9BE \ || (x)==PIX_FMT_YUV444P9BE \ || (x)==PIX_FMT_YUV420P10BE \ || (x)==PIX_FMT_YUV422P10BE \ diff --git a/libswscale/utils.c b/libswscale/utils.c index ea5a1ab468..64ac77d7b7 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -129,6 +129,8 @@ const static FormatEntry format_entries[PIX_FMT_NB] = { [PIX_FMT_YUV420P9LE] = { 1 , 1 }, [PIX_FMT_YUV420P10BE] = { 1 , 1 }, [PIX_FMT_YUV420P10LE] = { 1 , 1 }, + [PIX_FMT_YUV422P9BE] = { 1 , 1 }, + [PIX_FMT_YUV422P9LE] = { 1 , 1 }, [PIX_FMT_YUV422P10BE] = { 1 , 1 }, [PIX_FMT_YUV422P10LE] = { 1 , 1 }, [PIX_FMT_YUV444P9BE] = { 1 , 1 }, diff --git a/tests/ref/lavfi/pixdesc b/tests/ref/lavfi/pixdesc index f27d31bb00..c4d789095c 100644 --- a/tests/ref/lavfi/pixdesc +++ b/tests/ref/lavfi/pixdesc @@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71 yuv422p10le 
d0607c260a45c973e6639f4e449730ad yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed yuv422p16le f87c81bf16916b64d201359be0b4b6f4 +yuv422p9be 29b71579946940a8c00fa844c9dff507 +yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6 diff --git a/tests/ref/lavfi/pixfmts_copy b/tests/ref/lavfi/pixfmts_copy index f27d31bb00..c4d789095c 100644 --- a/tests/ref/lavfi/pixfmts_copy +++ b/tests/ref/lavfi/pixfmts_copy @@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71 yuv422p10le d0607c260a45c973e6639f4e449730ad yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed yuv422p16le f87c81bf16916b64d201359be0b4b6f4 +yuv422p9be 29b71579946940a8c00fa844c9dff507 +yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6 diff --git a/tests/ref/lavfi/pixfmts_null b/tests/ref/lavfi/pixfmts_null index f27d31bb00..c4d789095c 100644 --- a/tests/ref/lavfi/pixfmts_null +++ b/tests/ref/lavfi/pixfmts_null @@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71 yuv422p10le d0607c260a45c973e6639f4e449730ad yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed yuv422p16le f87c81bf16916b64d201359be0b4b6f4 +yuv422p9be 29b71579946940a8c00fa844c9dff507 +yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf yuv444p 0a98447b78fd476aa39686da6a74fa2e yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6 diff --git a/tests/ref/lavfi/pixfmts_scale b/tests/ref/lavfi/pixfmts_scale index 4af1ca43de..83b523e8a3 100644 --- a/tests/ref/lavfi/pixfmts_scale +++ b/tests/ref/lavfi/pixfmts_scale @@ -42,6 +42,8 @@ yuv422p10be cea7ca6b0e66d6f29539885896c88603 yuv422p10le a10c4a5837547716f13cd61918b145f9 yuv422p16be 285993ee0c0f4f8e511ee46f93c5f38c yuv422p16le 61bfcee8e54465f760164f5a75d40b5e +yuv422p9be 82494823944912f73cebc58ad2979bbd +yuv422p9le fc69c8a21f473916a4b4225636b97e06 
yuv440p 461503fdb9b90451020aa3b25ddf041c yuv444p 81b2eba962d12e8d64f003ac56f6faf2 yuv444p10be e9d3c8e744b8b0d8187ca092fa203fc9 diff --git a/tests/ref/lavfi/pixfmts_vflip b/tests/ref/lavfi/pixfmts_vflip index 21988f16d6..66d803cbdf 100644 --- a/tests/ref/lavfi/pixfmts_vflip +++ b/tests/ref/lavfi/pixfmts_vflip @@ -42,6 +42,8 @@ yuv422p10be 588fe319b96513c32e21d3e32b45447f yuv422p10le 11b57f2bd9661024153f3973b9090cdb yuv422p16be c092d083548c2a144c372a98c46875c7 yuv422p16le c071b9397a416d51cbe339345cbcba84 +yuv422p9be 7c6f1e140b3999ee7d923854e507752a +yuv422p9le 51f10d79c07989060dd06e767e6d7d60 yuv440p 876385e96165acf51271b20e5d85a416 yuv444p 9c3c667d1613b72d15bc6d851c5eb8f7 yuv444p10be 944a4997c4edb3a8dd0f0493cfd5a1fd From 76741b0e56bfbc74cfa32ff59e15cf420463569b Mon Sep 17 00:00:00 2001 From: Baptiste Coudurier Date: Tue, 16 Aug 2011 17:05:44 +0200 Subject: [PATCH 05/35] h264: 4:2:2 intra decoding support Signed-off-by: Diego Biurrun Signed-off-by: Ronald S. Bultje --- Changelog | 1 + libavcodec/arm/h264dsp_init_arm.c | 9 +- libavcodec/arm/h264pred_init_arm.c | 6 +- libavcodec/dsputil.h | 2 + libavcodec/h264.c | 97 ++++++++++++++----- libavcodec/h264.h | 12 +-- libavcodec/h264_cabac.c | 84 ++++++++++++++-- libavcodec/h264_cavlc.c | 127 ++++++++++++++++++++++-- libavcodec/h264_loopfilter.c | 61 +++++++++--- libavcodec/h264_mvpred.h | 7 +- libavcodec/h264_ps.c | 5 +- libavcodec/h264data.h | 9 +- libavcodec/h264dsp.c | 38 ++++++-- libavcodec/h264dsp.h | 8 +- libavcodec/h264dsp_template.c | 16 ++++ libavcodec/h264idct_template.c | 50 ++++++++++ libavcodec/h264pred.c | 51 +++++++--- libavcodec/h264pred.h | 6 +- libavcodec/h264pred_template.c | 138 +++++++++++++++++++++++++++ libavcodec/ppc/h264_altivec.c | 5 +- libavcodec/rv34.c | 2 +- libavcodec/vp8.c | 2 +- libavcodec/x86/h264_intrapred_init.c | 47 +++++---- libavcodec/x86/h264dsp_mmx.c | 23 +++-- 24 files changed, 673 insertions(+), 133 deletions(-) diff --git a/Changelog b/Changelog index 3041632a33..491f93b79c 100644 
--- a/Changelog +++ b/Changelog @@ -54,6 +54,7 @@ easier to use. The changes are: - boxblur filter - Ut Video decoder - Speex encoding via libspeex +- 4:2:2 H.264 decoding support version 0.7: diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index c2399e50ff..c1ca217add 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -92,7 +92,7 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); -static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth) +static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { if (bit_depth == 8) { c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; @@ -122,14 +122,15 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth) c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; c->h264_idct_add16 = ff_h264_idct_add16_neon; c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; - c->h264_idct_add8 = ff_h264_idct_add8_neon; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_neon; c->h264_idct8_add = ff_h264_idct8_add_neon; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; c->h264_idct8_add4 = ff_h264_idct8_add4_neon; } } -void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth) +void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { - if (HAVE_NEON) ff_h264dsp_init_neon(c, bit_depth); + if (HAVE_NEON) ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc); } diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c index e96f339a55..5fc07bc137 100644 --- a/libavcodec/arm/h264pred_init_arm.c +++ b/libavcodec/arm/h264pred_init_arm.c @@ -42,7 +42,7 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride); void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride); void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride); -static void 
ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth) +static void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc) { const int high_depth = bit_depth > 8; @@ -74,7 +74,7 @@ static void ff_h264_pred_init_neon(H264PredContext *h, int codec_id, const int b h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; } -void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, int bit_depth) +void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, int bit_depth, const int chroma_format_idc) { - if (HAVE_NEON) ff_h264_pred_init_neon(h, codec_id, bit_depth); + if (HAVE_NEON) ff_h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); } diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index bef2cdd4e8..acb2041460 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -63,8 +63,10 @@ void ff_h264_idct_dc_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int strid void ff_h264_idct_add16_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\ void ff_h264_idct_add16intra_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\ void ff_h264_idct8_add4_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\ +void ff_h264_idct_add8_422_ ## depth ## _c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\ void ff_h264_idct_add8_ ## depth ## _c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\ void ff_h264_luma_dc_dequant_idct_ ## depth ## _c(DCTELEM *output, DCTELEM *input, int qmul);\ +void ff_h264_chroma422_dc_dequant_idct_ ## depth ## _c(DCTELEM *block, int qmul);\ void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(DCTELEM *block, int qmul); H264_IDCT( 8) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 
1faaaa6802..f61f524508 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -942,7 +942,7 @@ static void clone_tables(H264Context *dst, H264Context *src, int i){ dst->list_counts = src->list_counts; dst->s.obmc_scratchpad = NULL; - ff_h264_pred_init(&dst->hpc, src->s.codec_id, src->sps.bit_depth_luma); + ff_h264_pred_init(&dst->hpc, src->s.codec_id, src->sps.bit_depth_luma, src->sps.chroma_format_idc); } /** @@ -970,8 +970,8 @@ static av_cold void common_init(H264Context *h){ s->height = s->avctx->height; s->codec_id= s->avctx->codec->id; - ff_h264dsp_init(&h->h264dsp, 8); - ff_h264_pred_init(&h->hpc, s->codec_id, 8); + ff_h264dsp_init(&h->h264dsp, 8, 1); + ff_h264_pred_init(&h->hpc, s->codec_id, 8, 1); h->dequant_coeff_pps= -1; s->unrestricted_mv=1; @@ -1432,11 +1432,16 @@ static void decode_postinit(H264Context *h, int setup_finished){ ff_thread_finish_setup(s->avctx); } -static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int chroma444, int simple){ +static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, + uint8_t *src_cb, uint8_t *src_cr, + int linesize, int uvlinesize, int simple) +{ MpegEncContext * const s = &h->s; uint8_t *top_border; int top_idx = 1; const int pixel_shift = h->pixel_shift; + int chroma444 = CHROMA444; + int chroma422 = CHROMA422; src_y -= linesize; src_cb -= uvlinesize; @@ -1460,6 +1465,14 @@ static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, ui AV_COPY128(top_border+16, src_cb + 15*uvlinesize); AV_COPY128(top_border+32, src_cr + 15*uvlinesize); } + } else if(chroma422) { + if (pixel_shift) { + AV_COPY128(top_border+32, src_cb + 15*uvlinesize); + AV_COPY128(top_border+48, src_cr + 15*uvlinesize); + } else { + AV_COPY64(top_border+16, src_cb + 15*uvlinesize); + AV_COPY64(top_border+24, src_cr + 15*uvlinesize); + } } else { if (pixel_shift) { AV_COPY128(top_border+32, src_cb+7*uvlinesize); @@ 
-1495,6 +1508,14 @@ static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, ui AV_COPY128(top_border+16, src_cb + 16*linesize); AV_COPY128(top_border+32, src_cr + 16*linesize); } + } else if(chroma422) { + if (pixel_shift) { + AV_COPY128(top_border+32, src_cb+16*uvlinesize); + AV_COPY128(top_border+48, src_cr+16*uvlinesize); + } else { + AV_COPY64(top_border+16, src_cb+16*uvlinesize); + AV_COPY64(top_border+24, src_cr+16*uvlinesize); + } } else { if (pixel_shift) { AV_COPY128(top_border+32, src_cb+8*uvlinesize); @@ -1773,10 +1794,11 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i /* is_h264 should always be true if SVQ3 is disabled. */ const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264; void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); + const int block_h = 16 >> s->chroma_y_shift; dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16; - dest_cb = s->current_picture.f.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8; - dest_cr = s->current_picture.f.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8; + dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h; + dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h; s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4); s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2); @@ -1789,8 +1811,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i block_offset = &h->block_offset[48]; if(mb_y&1){ //FIXME move out of this function? 
dest_y -= s->linesize*15; - dest_cb-= s->uvlinesize*7; - dest_cr-= s->uvlinesize*7; + dest_cb-= s->uvlinesize * (block_h - 1); + dest_cr-= s->uvlinesize * (block_h - 1); } if(FRAME_MBAFF) { int list; @@ -1842,12 +1864,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i } } } else { - for (i = 0; i < 8; i++) { + for (i = 0; i < block_h; i++) { uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize); for (j = 0; j < 8; j++) tmp_cb[j] = get_bits(&gb, bit_depth); } - for (i = 0; i < 8; i++) { + for (i = 0; i < block_h; i++) { uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize); for (j = 0; j < 8; j++) tmp_cr[j] = get_bits(&gb, bit_depth); @@ -1865,7 +1887,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i memset(dest_cr + i*uvlinesize, 128, 8); } } else { - for (i = 0; i < 8; i++) { + for (i = 0; i < block_h; i++) { memcpy(dest_cb + i*uvlinesize, h->mb + 128 + i*4, 8); memcpy(dest_cr + i*uvlinesize, h->mb + 160 + i*4, 8); } @@ -1913,10 +1935,18 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i } }else{ if(is_h264){ + int qp[2]; + if (CHROMA422) { + qp[0] = h->chroma_qp[0] + 3; + qp[1] = h->chroma_qp[1] + 3; + } else { + qp[0] = h->chroma_qp[0]; + qp[1] = h->chroma_qp[1]; + } if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ]) - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*1 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*1 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][qp[0]][0]); if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ]) - h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*2 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); + h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*2 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 
2:5][qp[1]][0]); h->h264dsp.h264_idct_add8(dest, block_offset, h->mb, uvlinesize, h->non_zero_count_cache); @@ -2555,11 +2585,13 @@ static int decode_slice_header(H264Context *h, H264Context *h0){ h->b_stride= s->mb_width*4; + s->chroma_y_shift = h->sps.chroma_format_idc <= 1; // 400 uses yuv420p + s->width = 16*s->mb_width - (2>>CHROMA444)*FFMIN(h->sps.crop_right, (8<<CHROMA444)-1); if(h->sps.frame_mbs_only_flag) - s->height= 16*s->mb_height - (2>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1); + s->height= 16*s->mb_height - (1<<s->chroma_y_shift)*FFMIN(h->sps.crop_bottom, (16>>s->chroma_y_shift)-1); else - s->height= 16*s->mb_height - (4>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1); + s->height= 16*s->mb_height - (2<<s->chroma_y_shift)*FFMIN(h->sps.crop_bottom, (16>>s->chroma_y_shift)-1); if (s->context_initialized && ( s->width != s->avctx->width || s->height != s->avctx->height @@ -2601,14 +2633,26 @@ static int decode_slice_header(H264Context *h, H264Context *h0){ switch (h->sps.bit_depth_luma) { case 9 : - s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P9 : PIX_FMT_YUV420P9; + if (CHROMA444) + s->avctx->pix_fmt = PIX_FMT_YUV444P9; + else if (CHROMA422) + s->avctx->pix_fmt = PIX_FMT_YUV422P9; + else + s->avctx->pix_fmt = PIX_FMT_YUV420P9; break; case 10 : - s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P10 : PIX_FMT_YUV420P10; + if (CHROMA444) + s->avctx->pix_fmt = PIX_FMT_YUV444P10; + else if (CHROMA422) + s->avctx->pix_fmt = PIX_FMT_YUV422P10; + else + s->avctx->pix_fmt = PIX_FMT_YUV420P10; break; default: if (CHROMA444){ s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ444P : PIX_FMT_YUV444P; + } else if (CHROMA422) { + s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P : PIX_FMT_YUV422P; }else{ s->avctx->pix_fmt = s->avctx->get_format(s->avctx, s->avctx->codec->pix_fmts ? 
@@ -3272,6 +3316,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){ const int end_mb_y= s->mb_y + FRAME_MBAFF; const int old_slice_type= h->slice_type; const int pixel_shift = h->pixel_shift; + const int block_h = 16 >> s->chroma_y_shift; if(h->deblocking_filter) { for(mb_x= start_x; mb_xmb_x= mb_x; s->mb_y= mb_y; dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16; - dest_cb = s->current_picture.f.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444); - dest_cr = s->current_picture.f.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444); + dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h; + dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h; //FIXME simplify above if (MB_FIELD) { @@ -3297,14 +3342,14 @@ static void loop_filter(H264Context *h, int start_x, int end_x){ uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; if(mb_y&1){ //FIXME move out of this function? 
dest_y -= s->linesize*15; - dest_cb-= s->uvlinesize*((8 << CHROMA444)-1); - dest_cr-= s->uvlinesize*((8 << CHROMA444)-1); + dest_cb-= s->uvlinesize * (block_h - 1); + dest_cr-= s->uvlinesize * (block_h - 1); } } else { linesize = h->mb_linesize = s->linesize; uvlinesize = h->mb_uvlinesize = s->uvlinesize; } - backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, CHROMA444, 0); + backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); if(fill_filter_caches(h, mb_type)) continue; h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.f.qscale_table[mb_xy]); @@ -3742,13 +3787,15 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){ if(avctx->has_b_frames < 2) avctx->has_b_frames= !s->low_delay; - if (avctx->bits_per_raw_sample != h->sps.bit_depth_luma) { + if (avctx->bits_per_raw_sample != h->sps.bit_depth_luma || + h->cur_chroma_format_idc != h->sps.chroma_format_idc) { if (h->sps.bit_depth_luma >= 8 && h->sps.bit_depth_luma <= 10) { avctx->bits_per_raw_sample = h->sps.bit_depth_luma; + h->cur_chroma_format_idc = h->sps.chroma_format_idc; h->pixel_shift = h->sps.bit_depth_luma > 8; - ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma); - ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma); + ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma, h->sps.chroma_format_idc); + ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma, h->sps.chroma_format_idc); s->dsp.dct_bits = h->sps.bit_depth_luma > 8 ? 
32 : 16; dsputil_init(&s->dsp, s->avctx); } else { diff --git a/libavcodec/h264.h b/libavcodec/h264.h index 122a54aca0..bd2b5d8fe5 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -39,13 +39,6 @@ #define interlaced_dct interlaced_dct_is_a_bad_name #define mb_intra mb_intra_is_not_initialized_see_mb_type -#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8 -#define COEFF_TOKEN_VLC_BITS 8 -#define TOTAL_ZEROS_VLC_BITS 9 -#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3 -#define RUN_VLC_BITS 3 -#define RUN7_VLC_BITS 6 - #define MAX_SPS_COUNT 32 #define MAX_PPS_COUNT 256 @@ -92,6 +85,7 @@ #define CABAC h->pps.cabac #endif +#define CHROMA422 (h->sps.chroma_format_idc == 2) #define CHROMA444 (h->sps.chroma_format_idc == 3) #define EXTENDED_SAR 255 @@ -582,6 +576,8 @@ typedef struct H264Context{ // Timestamp stuff int sei_buffering_period_present; ///< Buffering period SEI flag int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs + + int cur_chroma_format_idc; }H264Context; @@ -809,7 +805,7 @@ static av_always_inline void write_back_non_zero_count(H264Context *h){ AV_COPY32(&nnz[32], &nnz_cache[4+8*11]); AV_COPY32(&nnz[36], &nnz_cache[4+8*12]); - if(CHROMA444){ + if(!h->s.chroma_y_shift){ AV_COPY32(&nnz[24], &nnz_cache[4+8* 8]); AV_COPY32(&nnz[28], &nnz_cache[4+8* 9]); AV_COPY32(&nnz[40], &nnz_cache[4+8*13]); diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c index 065b6e85e1..0325ea456f 100644 --- a/libavcodec/h264_cabac.c +++ b/libavcodec/h264_cabac.c @@ -1565,7 +1565,12 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = { 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; -static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { +static av_always_inline void +decode_cabac_residual_internal(H264Context *h, DCTELEM *block, + int cat, int n, const uint8_t *scantable, + const uint32_t *qmul, int max_coeff, + int 
is_dc, int chroma422) +{ static const int significant_coeff_flag_offset[2][14] = { { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 }, { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 } @@ -1587,12 +1592,16 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 } }; + static const uint8_t sig_coeff_offset_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). * map node ctx => cabac ctx for level=1 */ static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; /* map node ctx => cabac ctx for level>1 */ - static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; + static const uint8_t coeff_abs_levelgt1_ctx[2][8] = { + { 5, 5, 5, 5, 6, 7, 8, 9 }, + { 5, 5, 5, 5, 6, 7, 8, 8 }, // 422/dc case + }; static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after decoding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, @@ -1651,12 +1660,20 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, last_coeff_ctx_base, sig_off); } else { - coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index, - last_coeff_ctx_base-significant_coeff_ctx_base); + if (is_dc && chroma422) { // dc 422 + DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); + } else { + coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index, + last_coeff_ctx_base-significant_coeff_ctx_base); + } #else DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); } else { - DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); + if (is_dc && chroma422) { // dc 
422 + DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); + } else { + DECODE_SIGNIFICANCE(max_coeff - 1, last, last); + } #endif } assert(coeff_count > 0); @@ -1691,7 +1708,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT } \ } else { \ int coeff_abs = 2; \ - ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; \ + ctx = coeff_abs_levelgt1_ctx[is_dc && chroma422][node_ctx] + abs_level_m1_ctx_base; \ node_ctx = coeff_abs_level_transition[1][node_ctx]; \ \ while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \ @@ -1733,11 +1750,18 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT } static void decode_cabac_residual_dc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { - decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1); + decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 0); +} + +static void decode_cabac_residual_dc_internal_422(H264Context *h, DCTELEM *block, + int cat, int n, const uint8_t *scantable, + int max_coeff) +{ + decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1); } static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { - decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0); + decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0, 0); } /* cat: 0-> DC 16x16 n = 0 @@ -1761,6 +1785,19 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM * decode_cabac_residual_dc_internal( h, block, cat, n, scantable, max_coeff ); } +static av_always_inline void +decode_cabac_residual_dc_422(H264Context *h, DCTELEM *block, + int cat, int n, const uint8_t *scantable, + int max_coeff) +{ + /* read coded block flag */ + if (get_cabac(&h->cabac, 
&h->cabac_state[get_cabac_cbf_ctx(h, cat, n, max_coeff, 1)]) == 0) { + h->non_zero_count_cache[scan8[n]] = 0; + return; + } + decode_cabac_residual_dc_internal_422(h, block, cat, n, scantable, max_coeff); +} + static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { /* read coded block flag */ if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) { @@ -2313,7 +2350,36 @@ decode_intra_mb: if(CHROMA444){ decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1); decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2); - } else { + } else if (CHROMA422) { + if( cbp&0x30 ){ + int c; + for( c = 0; c < 2; c++ ) { + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); + decode_cabac_residual_dc_422(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, + CHROMA_DC_BLOCK_INDEX + c, + chroma422_dc_scan, 8); + } + } + + if( cbp&0x20 ) { + int c, i, i8x8; + for( c = 0; c < 2; c++ ) { + DCTELEM *mb = h->mb + (16*(16 + 16*c) << pixel_shift); + qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 
0:3)][h->chroma_qp[c]]; + for (i8x8 = 0; i8x8 < 2; i8x8++) { + for (i = 0; i < 4; i++) { + const int index = 16 + 16 * c + 8*i8x8 + i; + //av_log(s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16); + decode_cabac_residual_nondc(h, mb, 4, index, scan + 1, qmul, 15); + mb += 16<non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1); + fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1); + } + } else /* yuv420 */ { if( cbp&0x30 ){ int c; for( c = 0; c < 2; c++ ) { diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c index ca7b9399d3..b94b51b9a1 100644 --- a/libavcodec/h264_cavlc.c +++ b/libavcodec/h264_cavlc.c @@ -62,6 +62,30 @@ static const uint8_t chroma_dc_coeff_token_bits[4*5]={ 2, 3, 2, 0, }; +static const uint8_t chroma422_dc_coeff_token_len[4*9]={ + 1, 0, 0, 0, + 7, 2, 0, 0, + 7, 7, 3, 0, + 9, 7, 7, 5, + 9, 9, 7, 6, + 10, 10, 9, 7, + 11, 11, 10, 7, + 12, 12, 11, 10, + 13, 12, 12, 11, +}; + +static const uint8_t chroma422_dc_coeff_token_bits[4*9]={ + 1, 0, 0, 0, + 15, 1, 0, 0, + 14, 13, 1, 0, + 7, 12, 11, 1, + 6, 5, 10, 1, + 7, 6, 4, 9, + 7, 6, 5, 8, + 7, 6, 5, 4, + 7, 5, 4, 4, +}; + static const uint8_t coeff_token_len[4][4*17]={ { 1, 0, 0, 0, @@ -172,6 +196,26 @@ static const uint8_t chroma_dc_total_zeros_bits[3][4]= { { 1, 0, 0, 0,}, }; +static const uint8_t chroma422_dc_total_zeros_len[7][8]= { + { 1, 3, 3, 4, 4, 4, 5, 5 }, + { 3, 2, 3, 3, 3, 3, 3 }, + { 3, 3, 2, 2, 3, 3 }, + { 3, 2, 2, 2, 3 }, + { 2, 2, 2, 2 }, + { 2, 2, 1 }, + { 1, 1 }, +}; + +static const uint8_t chroma422_dc_total_zeros_bits[7][8]= { + { 1, 2, 3, 2, 3, 1, 1, 0 }, + { 0, 1, 1, 4, 5, 6, 7 }, + { 0, 1, 1, 2, 6, 7 }, + { 6, 0, 1, 2, 7 }, + { 0, 1, 2, 3 }, + { 0, 1, 1 }, + { 0, 1 }, +}; + static const uint8_t run_len[7][16]={ {1,1}, {1,2,2}, @@ -200,6 +244,10 @@ static VLC chroma_dc_coeff_token_vlc; static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2]; static const int chroma_dc_coeff_token_vlc_table_size = 256; +static VLC chroma422_dc_coeff_token_vlc; +static 
VLC_TYPE chroma422_dc_coeff_token_vlc_table[8192][2]; +static const int chroma422_dc_coeff_token_vlc_table_size = 8192; + static VLC total_zeros_vlc[15]; static VLC_TYPE total_zeros_vlc_tables[15][512][2]; static const int total_zeros_vlc_tables_size = 512; @@ -208,6 +256,10 @@ static VLC chroma_dc_total_zeros_vlc[3]; static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2]; static const int chroma_dc_total_zeros_vlc_tables_size = 8; +static VLC chroma422_dc_total_zeros_vlc[7]; +static VLC_TYPE chroma422_dc_total_zeros_vlc_tables[7][32][2]; +static const int chroma422_dc_total_zeros_vlc_tables_size = 32; + static VLC run_vlc[6]; static VLC_TYPE run_vlc_tables[6][8][2]; static const int run_vlc_tables_size = 8; @@ -219,6 +271,14 @@ static const int run7_vlc_table_size = 96; #define LEVEL_TAB_BITS 8 static int8_t cavlc_level_tab[7][1<>2; }else{ if(n >= LUMA_DC_BLOCK_INDEX){ @@ -483,11 +564,16 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in if(total_coeff == max_coeff) zeros_left=0; else{ - /* FIXME: we don't actually support 4:2:2 yet. */ - if(max_coeff <= 8) - zeros_left= get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[ total_coeff ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1); - else + if (max_coeff <= 8) { + if (max_coeff == 4) + zeros_left = get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[total_coeff].table, + CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1); + else + zeros_left = get_vlc2(gb, (chroma422_dc_total_zeros_vlc-1)[total_coeff].table, + CHROMA422_DC_TOTAL_ZEROS_VLC_BITS, 1); + } else { zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1); + } } #define STORE_BLOCK(type) \ @@ -994,7 +1080,7 @@ decode_intra_mb: s->current_picture.f.mb_type[mb_xy] = mb_type; if(cbp || IS_INTRA16x16(mb_type)){ - int i4x4, chroma_idx; + int i4x4, i8x8, chroma_idx; int dquant; int ret; GetBitContext *gb= IS_INTRA(mb_type) ? 
h->intra_gb_ptr : h->inter_gb_ptr; @@ -1036,7 +1122,34 @@ decode_intra_mb: if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ){ return -1; } - } else { + } else if (CHROMA422) { + if(cbp&0x30){ + for(chroma_idx=0; chroma_idx<2; chroma_idx++) + if (decode_residual(h, gb, h->mb + ((256 + 16*16*chroma_idx) << pixel_shift), + CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma422_dc_scan, + NULL, 8) < 0) { + return -1; + } + } + + if(cbp&0x20){ + for(chroma_idx=0; chroma_idx<2; chroma_idx++){ + const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]]; + DCTELEM *mb = h->mb + (16*(16 + 16*chroma_idx) << pixel_shift); + for (i8x8 = 0; i8x8 < 2; i8x8++) { + for (i4x4 = 0; i4x4 < 4; i4x4++) { + const int index = 16 + 16*chroma_idx + 8*i8x8 + i4x4; + if (decode_residual(h, gb, mb, index, scan + 1, qmul, 15) < 0) + return -1; + mb += 16 << pixel_shift; + } + } + } + }else{ + fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1); + fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1); + } + } else /* yuv420 */ { if(cbp&0x30){ for(chroma_idx=0; chroma_idx<2; chroma_idx++) if( decode_residual(h, gb, h->mb + ((256 + 16*16*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){ diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c index 377968fcd2..64b07e91f0 100644 --- a/libavcodec/h264_loopfilter.c +++ b/libavcodec/h264_loopfilter.c @@ -212,6 +212,7 @@ static void av_always_inline h264_filter_mb_fast_internal( H264Context *h, int m MpegEncContext * const s = &h->s; int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY)); int chroma444 = CHROMA444; + int chroma422 = CHROMA422; int mb_xy = h->mb_xy; int left_type= h->left_type[LTOP]; @@ -289,6 +290,23 @@ static void av_always_inline h264_filter_mb_fast_internal( H264Context *h, int m filter_mb_edgeh( &img_cb[4*3*linesize], linesize, bS3, qpc, a, b, h, 0); filter_mb_edgeh( 
&img_cr[4*3*linesize], linesize, bS3, qpc, a, b, h, 0); } + }else if(chroma422){ + if(left_type){ + filter_mb_edgecv(&img_cb[2*0<s; int edge; int chroma_qp_avg[2]; + int chroma444 = CHROMA444; + int chroma422 = CHROMA422; const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy; const int mbm_type = dir == 0 ? h->left_type[LTOP] : h->top_type; @@ -564,8 +584,9 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u for( edge = 1; edge < edges; edge++ ) { DECLARE_ALIGNED(8, int16_t, bS)[4]; int qp; + const int deblock_edge = !IS_8x8DCT(mb_type & (edge<<24)); // (edge&1) && IS_8x8DCT(mb_type) - if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) + if (!deblock_edge && (!chroma422 || dir == 0)) continue; if( IS_INTRA(mb_type)) { @@ -627,14 +648,23 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u } } } else { - filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, a, b, h, 0 ); - if (chroma) { - if (chroma444) { - filter_mb_edgeh ( &img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0); - filter_mb_edgeh ( &img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0); - } else if( (edge&1) == 0 ) { - filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0); - filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0); + if (chroma422) { + if (deblock_edge) + filter_mb_edgeh(&img_y[4*edge*linesize], linesize, bS, qp, a, b, h, 0); + if (chroma) { + filter_mb_edgech(&img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0); + filter_mb_edgech(&img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0); + } + } else { + filter_mb_edgeh(&img_y[4*edge*linesize], linesize, bS, qp, a, b, h, 0); + if (chroma) { + if (chroma444) { + filter_mb_edgeh (&img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0); + filter_mb_edgeh (&img_cr[4*edge*uvlinesize], 
uvlinesize, bS, h->chroma_qp[1], a, b, h, 0); + } else if ((edge&1) == 0) { + filter_mb_edgech(&img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], a, b, h, 0); + filter_mb_edgech(&img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], a, b, h, 0); + } } } } @@ -726,6 +756,11 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint filter_mb_mbaff_edgev ( h, img_cb + 8*uvlinesize, uvlinesize, bS+4, 1, bqp[1], a, b, 1 ); filter_mb_mbaff_edgev ( h, img_cr, uvlinesize, bS , 1, rqp[0], a, b, 1 ); filter_mb_mbaff_edgev ( h, img_cr + 8*uvlinesize, uvlinesize, bS+4, 1, rqp[1], a, b, 1 ); + } else if (CHROMA422) { + filter_mb_mbaff_edgecv(h, img_cb, uvlinesize, bS , 1, bqp[0], a, b, 1); + filter_mb_mbaff_edgecv(h, img_cb + 8*uvlinesize, uvlinesize, bS+4, 1, bqp[1], a, b, 1); + filter_mb_mbaff_edgecv(h, img_cr, uvlinesize, bS , 1, rqp[0], a, b, 1); + filter_mb_mbaff_edgecv(h, img_cr + 8*uvlinesize, uvlinesize, bS+4, 1, rqp[1], a, b, 1); }else{ filter_mb_mbaff_edgecv( h, img_cb, uvlinesize, bS , 1, bqp[0], a, b, 1 ); filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1], a, b, 1 ); @@ -754,9 +789,9 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint #if CONFIG_SMALL for( dir = 0; dir < 2; dir++ ) - filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, a, b, chroma, CHROMA444, dir); + filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 
0 : first_vertical_edge_done, a, b, chroma, dir); #else - filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, a, b, chroma, CHROMA444, 0); - filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, a, b, chroma, CHROMA444, 1); + filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, a, b, chroma, 0); + filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, a, b, chroma, 1); #endif } diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h index 7c7086d440..4b6a083bb4 100644 --- a/libavcodec/h264_mvpred.h +++ b/libavcodec/h264_mvpred.h @@ -510,7 +510,7 @@ static void fill_decode_caches(H264Context *h, int mb_type){ if(top_type){ nnz = h->non_zero_count[top_xy]; AV_COPY32(&nnz_cache[4+8* 0], &nnz[4*3]); - if(CHROMA444){ + if(!s->chroma_y_shift){ AV_COPY32(&nnz_cache[4+8* 5], &nnz[4* 7]); AV_COPY32(&nnz_cache[4+8*10], &nnz[4*11]); }else{ @@ -534,6 +534,11 @@ static void fill_decode_caches(H264Context *h, int mb_type){ nnz_cache[3+8* 7 + 2*8*i]= nnz[left_block[8+1+2*i]+4*4]; nnz_cache[3+8*11 + 2*8*i]= nnz[left_block[8+0+2*i]+8*4]; nnz_cache[3+8*12 + 2*8*i]= nnz[left_block[8+1+2*i]+8*4]; + }else if(CHROMA422) { + nnz_cache[3+8* 6 + 2*8*i]= nnz[left_block[8+0+2*i]-2+4*4]; + nnz_cache[3+8* 7 + 2*8*i]= nnz[left_block[8+1+2*i]-2+4*4]; + nnz_cache[3+8*11 + 2*8*i]= nnz[left_block[8+0+2*i]-2+8*4]; + nnz_cache[3+8*12 + 2*8*i]= nnz[left_block[8+1+2*i]-2+8*4]; }else{ nnz_cache[3+8* 6 + 8*i]= nnz[left_block[8+4+2*i]]; nnz_cache[3+8*11 + 8*i]= nnz[left_block[8+5+2*i]]; diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c index 677ca80abb..76bf116a3f 100644 --- a/libavcodec/h264_ps.c +++ b/libavcodec/h264_ps.c @@ -396,7 +396,8 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ #endif sps->crop= get_bits1(&s->gb); 
if(sps->crop){ - int crop_limit = sps->chroma_format_idc == 3 ? 16 : 8; + int crop_vertical_limit = sps->chroma_format_idc & 2 ? 16 : 8; + int crop_horizontal_limit = sps->chroma_format_idc == 3 ? 16 : 8; sps->crop_left = get_ue_golomb(&s->gb); sps->crop_right = get_ue_golomb(&s->gb); sps->crop_top = get_ue_golomb(&s->gb); @@ -404,7 +405,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ if(sps->crop_left || sps->crop_top){ av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n"); } - if(sps->crop_right >= crop_limit || sps->crop_bottom >= crop_limit){ + if(sps->crop_right >= crop_horizontal_limit || sps->crop_bottom >= crop_vertical_limit){ av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n"); } }else{ diff --git a/libavcodec/h264data.h b/libavcodec/h264data.h index 1851169dd3..2cfa548624 100644 --- a/libavcodec/h264data.h +++ b/libavcodec/h264data.h @@ -80,7 +80,14 @@ static const uint8_t luma_dc_field_scan[16]={ static const uint8_t chroma_dc_scan[4]={ (0+0*2)*16, (1+0*2)*16, - (0+1*2)*16, (1+1*2)*16, //FIXME + (0+1*2)*16, (1+1*2)*16, +}; + +static const uint8_t chroma422_dc_scan[8]={ + (0+0*2)*16, (0+1*2)*16, + (1+0*2)*16, (0+2*2)*16, + (0+3*2)*16, (1+1*2)*16, + (1+2*2)*16, (1+3*2)*16, }; // zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)] diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index 64f4856189..19ad2db3d9 100644 --- a/libavcodec/h264dsp.c +++ b/libavcodec/h264dsp.c @@ -41,7 +41,7 @@ #include "h264dsp_template.c" #undef BIT_DEPTH -void ff_h264dsp_init(H264DSPContext *c, const int bit_depth) +void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { #undef FUNC #define FUNC(a, depth) a ## _ ## depth ## _c @@ -53,10 +53,16 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth) c->h264_idct8_dc_add= FUNC(ff_h264_idct8_dc_add, depth);\ c->h264_idct_add16 = 
FUNC(ff_h264_idct_add16, depth);\ c->h264_idct8_add4 = FUNC(ff_h264_idct8_add4, depth);\ - c->h264_idct_add8 = FUNC(ff_h264_idct_add8, depth);\ + if (chroma_format_idc == 1)\ + c->h264_idct_add8 = FUNC(ff_h264_idct_add8, depth);\ + else\ + c->h264_idct_add8 = FUNC(ff_h264_idct_add8_422, depth);\ c->h264_idct_add16intra= FUNC(ff_h264_idct_add16intra, depth);\ c->h264_luma_dc_dequant_idct= FUNC(ff_h264_luma_dc_dequant_idct, depth);\ - c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma_dc_dequant_idct, depth);\ + if (chroma_format_idc == 1)\ + c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma_dc_dequant_idct, depth);\ + else\ + c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ \ c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ @@ -86,11 +92,23 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth) c->h264_h_loop_filter_luma_intra= FUNC(h264_h_loop_filter_luma_intra, depth);\ c->h264_h_loop_filter_luma_mbaff_intra= FUNC(h264_h_loop_filter_luma_mbaff_intra, depth);\ c->h264_v_loop_filter_chroma= FUNC(h264_v_loop_filter_chroma, depth);\ - c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma, depth);\ - c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma_mbaff, depth);\ + if (chroma_format_idc == 1)\ + c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma, depth);\ + else\ + c->h264_h_loop_filter_chroma= FUNC(h264_h_loop_filter_chroma422, depth);\ + if (chroma_format_idc == 1)\ + c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma_mbaff, depth);\ + else\ + c->h264_h_loop_filter_chroma_mbaff= FUNC(h264_h_loop_filter_chroma422_mbaff, depth);\ c->h264_v_loop_filter_chroma_intra= FUNC(h264_v_loop_filter_chroma_intra, depth);\ - c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma_intra, depth);\ - c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma_mbaff_intra, 
depth);\ + if (chroma_format_idc == 1)\ + c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma_intra, depth);\ + else\ + c->h264_h_loop_filter_chroma_intra= FUNC(h264_h_loop_filter_chroma422_intra, depth);\ + if (chroma_format_idc == 1)\ + c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma_mbaff_intra, depth);\ + else\ + c->h264_h_loop_filter_chroma_mbaff_intra= FUNC(h264_h_loop_filter_chroma422_mbaff_intra, depth);\ c->h264_loop_filter_strength= NULL; switch (bit_depth) { @@ -105,7 +123,7 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth) break; } - if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth); - if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth); - if (HAVE_MMX) ff_h264dsp_init_x86(c, bit_depth); + if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); + if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); + if (HAVE_MMX) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc); } diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index 6972725781..7337f178e9 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -74,9 +74,9 @@ typedef struct H264DSPContext{ void (*h264_chroma_dc_dequant_idct)(DCTELEM *block, int qmul); }H264DSPContext; -void ff_h264dsp_init(H264DSPContext *c, const int bit_depth); -void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth); -void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth); -void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth); +void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); +void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); +void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); +void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); #endif /* AVCODEC_H264DSP_H */ diff --git a/libavcodec/h264dsp_template.c 
b/libavcodec/h264dsp_template.c index d11eff0919..ee4bbe51dc 100644 --- a/libavcodec/h264dsp_template.c +++ b/libavcodec/h264dsp_template.c @@ -275,6 +275,14 @@ static void FUNCC(h264_h_loop_filter_chroma_mbaff)(uint8_t *pix, int stride, int { FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 1, alpha, beta, tc0); } +static void FUNCC(h264_h_loop_filter_chroma422)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 4, alpha, beta, tc0); +} +static void FUNCC(h264_h_loop_filter_chroma422_mbaff)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0); +} static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta) { @@ -312,3 +320,11 @@ static void FUNCC(h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix, int strid { FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 1, alpha, beta); } +static void FUNCC(h264_h_loop_filter_chroma422_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 4, alpha, beta); +} +static void FUNCC(h264_h_loop_filter_chroma422_mbaff_intra)(uint8_t *pix, int stride, int alpha, int beta) +{ + FUNCC(h264_loop_filter_chroma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta); +} diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c index ba5571576d..eba850ac6f 100644 --- a/libavcodec/h264idct_template.c +++ b/libavcodec/h264idct_template.c @@ -224,6 +224,29 @@ void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM * } } } + +void FUNCC(ff_h264_idct_add8_422)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){ + int i, j; + + for(j=1; j<3; j++){ + for(i=j*16; i> 8; + block[stride*1+offset]= ((z1 + z2)*qmul + 128) 
>> 8; + block[stride*2+offset]= ((z1 - z2)*qmul + 128) >> 8; + block[stride*3+offset]= ((z0 - z3)*qmul + 128) >> 8; + } +} + void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){ const int stride= 16*2; const int xStride= 16; diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c index e73d82c547..17199d01e6 100644 --- a/libavcodec/h264pred.c +++ b/libavcodec/h264pred.c @@ -361,7 +361,7 @@ static void pred8x8_tm_vp8_c(uint8_t *src, int stride){ /** * Set the intra prediction function pointers. */ -void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ +void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc){ // MpegEncContext * const s = &h->s; #undef FUNC @@ -434,20 +434,39 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ h->pred8x8l[TOP_DC_PRED ]= FUNCC(pred8x8l_top_dc , depth);\ h->pred8x8l[DC_128_PRED ]= FUNCC(pred8x8l_128_dc , depth);\ \ - h->pred8x8[VERT_PRED8x8 ]= FUNCC(pred8x8_vertical , depth);\ - h->pred8x8[HOR_PRED8x8 ]= FUNCC(pred8x8_horizontal , depth);\ + if (chroma_format_idc == 1) {\ + h->pred8x8[VERT_PRED8x8 ]= FUNCC(pred8x8_vertical , depth);\ + h->pred8x8[HOR_PRED8x8 ]= FUNCC(pred8x8_horizontal , depth);\ + } else {\ + h->pred8x8[VERT_PRED8x8 ]= FUNCC(pred8x16_vertical , depth);\ + h->pred8x8[HOR_PRED8x8 ]= FUNCC(pred8x16_horizontal , depth);\ + }\ if (codec_id != CODEC_ID_VP8) {\ - h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_plane , depth);\ + if (chroma_format_idc == 1) {\ + h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x8_plane , depth);\ + } else {\ + h->pred8x8[PLANE_PRED8x8]= FUNCC(pred8x16_plane , depth);\ + }\ } else\ h->pred8x8[PLANE_PRED8x8]= FUNCD(pred8x8_tm_vp8);\ if(codec_id != CODEC_ID_RV40 && codec_id != CODEC_ID_VP8){\ - h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x8_dc , depth);\ - h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc , depth);\ - h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x8_top_dc , depth);\ - 
h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\ - h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\ - h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ - h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\ + if (chroma_format_idc == 1) {\ + h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x8_dc , depth);\ + h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc , depth);\ + h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x8_top_dc , depth);\ + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\ + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\ + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\ + } else {\ + h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x16_dc , depth);\ + h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc , depth);\ + h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc , depth);\ + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\ + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\ + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\ + }\ }else{\ h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\ h->pred8x8[LEFT_DC_PRED8x8]= FUNCD(pred8x8_left_dc_rv40);\ @@ -457,7 +476,11 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ h->pred8x8[DC_129_PRED8x8]= FUNCC(pred8x8_129_dc , depth);\ }\ }\ - h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x8_128_dc , depth);\ + if (chroma_format_idc == 1) {\ + h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x8_128_dc , depth);\ + } else {\ + h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x16_128_dc , depth);\ + }\ \ h->pred16x16[DC_PRED8x8 ]= FUNCC(pred16x16_dc , depth);\ h->pred16x16[VERT_PRED8x8 ]= FUNCC(pred16x16_vertical 
, depth);\ @@ -504,6 +527,6 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth){ break; } - if (ARCH_ARM) ff_h264_pred_init_arm(h, codec_id, bit_depth); - if (HAVE_MMX) ff_h264_pred_init_x86(h, codec_id, bit_depth); + if (ARCH_ARM) ff_h264_pred_init_arm(h, codec_id, bit_depth, chroma_format_idc); + if (HAVE_MMX) ff_h264_pred_init_x86(h, codec_id, bit_depth, chroma_format_idc); } diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h index 34b1e90bbc..b880446121 100644 --- a/libavcodec/h264pred.h +++ b/libavcodec/h264pred.h @@ -101,8 +101,8 @@ typedef struct H264PredContext{ void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); }H264PredContext; -void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth); -void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, const int bit_depth); -void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth); +void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc); +void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc); +void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc); #endif /* AVCODEC_H264PRED_H */ diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c index 750e82c12a..d4f654e18c 100644 --- a/libavcodec/h264pred_template.c +++ b/libavcodec/h264pred_template.c @@ -454,6 +454,19 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){ } } +static void FUNCC(pred8x16_vertical)(uint8_t *_src, int _stride){ + int i; + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0); + const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1); + + for(i=0; i<16; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + 
AV_WN4PA(((pixel4*)(src+i*stride))+1, b); + } +} + static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){ int i; pixel *src = (pixel*)_src; @@ -466,6 +479,17 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){ } } +static void FUNCC(pred8x16_horizontal)(uint8_t *_src, int stride){ + int i; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + for(i=0; i<16; i++){ + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, a); + } +} + #define PRED8x8_X(n, v)\ static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\ int i;\ @@ -482,6 +506,11 @@ PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1); PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0); PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1); +static void FUNCC(pred8x16_128_dc)(uint8_t *_src, int stride){ + FUNCC(pred8x8_128_dc)(_src, stride); + FUNCC(pred8x8_128_dc)(_src+8*stride, stride); +} + static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){ int i; int dc0, dc2; @@ -507,6 +536,11 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){ } } +static void FUNCC(pred8x16_left_dc)(uint8_t *_src, int stride){ + FUNCC(pred8x8_left_dc)(_src, stride); + FUNCC(pred8x8_left_dc)(_src+8*stride, stride); +} + static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){ int i; int dc0, dc1; @@ -532,6 +566,27 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){ } } +static void FUNCC(pred8x16_top_dc)(uint8_t *_src, int stride){ + int i; + int dc0, dc1; + pixel4 dc0splat, dc1splat; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + dc0=dc1=0; + for(i=0;i<4; i++){ + dc0+= src[i-stride]; + dc1+= src[4+i-stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + + for(i=0; i<16; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); + } +} + static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){ int i; int 
dc0, dc1, dc2; @@ -560,6 +615,48 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){ } } +static void FUNCC(pred8x16_dc)(uint8_t *_src, int stride){ + int i; + int dc0, dc1, dc2, dc3, dc4; + pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + dc0=dc1=dc2=dc3=dc4=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride] + src[i-stride]; + dc1+= src[4+i-stride]; + dc2+= src[-1+(i+4)*stride]; + dc3+= src[-1+(i+8)*stride]; + dc4+= src[-1+(i+12)*stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); + dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); + dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2); + dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3); + dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2); + dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3); + + for(i=0; i<4; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); + } + for(i=4; i<8; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat); + } + for(i=8; i<12; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat); + } + for(i=12; i<16; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat); + } +} + //the following 4 function should not be optimized! 
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){ FUNCC(pred8x8_top_dc)(src, stride); @@ -618,6 +715,47 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){ } } +static void FUNCC(pred8x16_plane)(uint8_t *_src, int _stride){ + int j, k; + int a; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel * const src0 = src +3-stride; + const pixel * src1 = src +8*stride-1; + const pixel * src2 = src1-2*stride; // == src+6*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + + for (k = 2; k <= 4; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + for (; k <= 8; ++k) { + src1 += stride; src2 -= stride; + V += k*(src1[0] - src2[0]); + } + + H = (17*H+16) >> 5; + V = (5*V+32) >> 6; + + a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H; + for(j=16; j>0; --j) { + int b = a; + a += V; + src[0] = CLIP((b ) >> 5); + src[1] = CLIP((b+ H) >> 5); + src[2] = CLIP((b+2*H) >> 5); + src[3] = CLIP((b+3*H) >> 5); + src[4] = CLIP((b+4*H) >> 5); + src[5] = CLIP((b+5*H) >> 5); + src[6] = CLIP((b+6*H) >> 5); + src[7] = CLIP((b+7*H) >> 5); + src += stride; + } +} + #define SRC(x,y) src[(x)+(y)*stride] #define PL(y) \ const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c index 8dd4ea392e..a9153788de 100644 --- a/libavcodec/ppc/h264_altivec.c +++ b/libavcodec/ppc/h264_altivec.c @@ -999,12 +999,13 @@ void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { } } -void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth) +void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) { if (bit_depth == 8) { c->h264_idct_add = ff_h264_idct_add_altivec; - c->h264_idct_add8 = ff_h264_idct_add8_altivec; + if (chroma_format_idc == 1) + c->h264_idct_add8 = 
ff_h264_idct_add8_altivec; c->h264_idct_add16 = ff_h264_idct_add16_altivec; c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec; c->h264_idct_dc_add= h264_idct_dc_add_altivec; diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c index b771a7f97e..091d49fdb5 100644 --- a/libavcodec/rv34.c +++ b/libavcodec/rv34.c @@ -1343,7 +1343,7 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx) if (MPV_common_init(s) < 0) return -1; - ff_h264_pred_init(&r->h, CODEC_ID_RV40, 8); + ff_h264_pred_init(&r->h, CODEC_ID_RV40, 8, 1); #if CONFIG_RV30_DECODER if (avctx->codec_id == CODEC_ID_RV30) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index d5cdaba486..95755e330a 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -1769,7 +1769,7 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx) avctx->pix_fmt = PIX_FMT_YUV420P; dsputil_init(&s->dsp, avctx); - ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8); + ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1); ff_vp8dsp_init(&s->vp8dsp); return 0; diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 414d5e6125..41e611ecd1 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -167,7 +167,7 @@ void ff_pred4x4_tm_vp8_mmxext (uint8_t *src, const uint8_t *topright, int s void ff_pred4x4_tm_vp8_ssse3 (uint8_t *src, const uint8_t *topright, int stride); void ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride); -void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth) +void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc) { #if HAVE_YASM int mm_flags = av_get_cpu_flags(); @@ -176,14 +176,17 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth if (mm_flags & AV_CPU_FLAG_MMX) { h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_mmx; h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmx; - h->pred8x8 
[VERT_PRED8x8 ] = ff_pred8x8_vertical_mmx; - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx; + if (chroma_format_idc == 1) { + h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_mmx; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx; + } if (codec_id == CODEC_ID_VP8) { h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_mmx; h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_mmx; h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmx; } else { - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_mmx; + if (chroma_format_idc == 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_mmx; if (codec_id == CODEC_ID_SVQ3) { h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_mmx; } else if (codec_id == CODEC_ID_RV40) { @@ -197,7 +200,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth if (mm_flags & AV_CPU_FLAG_MMX2) { h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmxext; h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmxext; - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext; + if (chroma_format_idc == 1) + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext; h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_mmxext; h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_mmxext; h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_mmxext; @@ -221,8 +225,10 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_mmxext; } if (codec_id == CODEC_ID_SVQ3 || codec_id == CODEC_ID_H264) { - h->pred8x8 [TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_mmxext; - h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_mmxext; + if (chroma_format_idc == 1) { + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_mmxext; + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_mmxext; + } } if (codec_id == CODEC_ID_VP8) { h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_mmxext; @@ -231,7 +237,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4 [TM_VP8_PRED ] = 
ff_pred4x4_tm_vp8_mmxext; h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_mmxext; } else { - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_mmx2; + if (chroma_format_idc == 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_mmx2; if (codec_id == CODEC_ID_SVQ3) { h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_mmx2; } else if (codec_id == CODEC_ID_RV40) { @@ -257,7 +264,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_sse2; h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_sse2; } else { - h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_plane_sse2; + if (chroma_format_idc == 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_sse2; if (codec_id == CODEC_ID_SVQ3) { h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_sse2; } else if (codec_id == CODEC_ID_RV40) { @@ -271,7 +279,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth if (mm_flags & AV_CPU_FLAG_SSSE3) { h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_ssse3; h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_ssse3; - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_ssse3; + if (chroma_format_idc == 1) + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_ssse3; h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_ssse3; h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_ssse3; h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_ssse3; @@ -286,7 +295,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_ssse3; h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_ssse3; } else { - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_ssse3; + if (chroma_format_idc == 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_ssse3; if (codec_id == CODEC_ID_SVQ3) { h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_ssse3; } else if (codec_id == CODEC_ID_RV40) { @@ -301,7 +311,8 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int 
bit_depth h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; - h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; + if (chroma_format_idc == 1) + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; @@ -319,11 +330,13 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2; - h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; - h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; - h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2; - h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; - h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; + if (chroma_format_idc == 1) { + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; + h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2; + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; + } h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 35ec267b42..910ad8401f 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -350,7 +350,7 @@ H264_BIWEIGHT_10_SSE( 4, 8, 10) H264_BIWEIGHT_10_SSE( 4, 4, 10) H264_BIWEIGHT_10_SSE( 4, 2, 10) -void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) +void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { int mm_flags = av_get_cpu_flags(); @@ -368,7 +368,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; - c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; + if 
(chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx; @@ -377,13 +378,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2; c->h264_idct_add16 = ff_h264_idct_add16_8_mmx2; c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx2; - c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2; c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext; - c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext; c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext; - c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext; + if (chroma_format_idc == 1) { + c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext; + c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext; + } #if ARCH_X86_32 c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext; c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext; @@ -413,7 +417,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; - c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; @@ -472,7 +477,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; - c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2; #if 
HAVE_ALIGNED_STACK c->h264_idct8_add = ff_h264_idct8_add_10_sse2; @@ -532,7 +538,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; c->h264_idct_add16 = ff_h264_idct_add16_10_avx; - c->h264_idct_add8 = ff_h264_idct_add8_10_avx; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_10_avx; c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx; #if HAVE_ALIGNED_STACK c->h264_idct8_add = ff_h264_idct8_add_10_avx; From 229d263cc914b5396847f7249fdda2e6ded9ec1b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Wed, 12 Oct 2011 08:55:37 -0700 Subject: [PATCH 06/35] Support for lossless and inter H264 4:2:2. --- libavcodec/h264.c | 66 ++++++++++++++++++++++++++++------ libavcodec/h264pred.c | 13 ++++--- libavcodec/h264pred_template.c | 39 +++++++++++++++++++- libavcodec/x86/h264dsp_mmx.c | 2 +- 4 files changed, 103 insertions(+), 17 deletions(-) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index f61f524508..8d652f13ce 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -457,6 +457,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, const int full_my= my>>2; const int pic_width = 16*s->mb_width; const int pic_height = 16*s->mb_height >> MB_FIELD; + int ysh; if(mx&7) extra_width -= 3; if(my&7) extra_height -= 3; @@ -465,7 +466,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, || full_my < 0-extra_height || full_mx + 16/*FIXME*/ > pic_width + extra_width || full_my + 16/*FIXME*/ > pic_height + extra_height){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, + 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); src_y= s->edge_emu_buffer + (2 << pixel_shift) + 
2*h->mb_linesize; emu=1; } @@ -502,25 +504,27 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, return; } - if(MB_FIELD){ + ysh = 3 - !!(CHROMA422); + if(!CHROMA422 && MB_FIELD){ // chroma offset when predicting from a field of opposite parity my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1)); emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1); } - src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize; - src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize; + + src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize; + src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize; if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); src_cb= s->edge_emu_buffer; } - chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7); + chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); src_cr= s->edge_emu_buffer; } - chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7); + chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); } static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, @@ -537,6 +541,9 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei if(chroma444){ 
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; + } else if (CHROMA422) { + dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; + dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; }else{ dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; @@ -577,6 +584,9 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom chroma_weight_op = luma_weight_op; dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; + } else if (CHROMA422) { + dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; + dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; }else{ dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; @@ -606,6 +616,14 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); + if (CHROMA422) { + chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, + tmp_cb + chroma_height * h->mb_uvlinesize, + h->mb_uvlinesize, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, + tmp_cr + chroma_height * h->mb_uvlinesize, + h->mb_uvlinesize, 5, weight0, weight1, 0); + } }else{ luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], @@ -616,6 +634,18 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 
h->chroma_log2_weight_denom, h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); + if (CHROMA422) { + chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, + tmp_cb + chroma_height * h->mb_uvlinesize, + h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], + h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); + chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, + tmp_cr + chroma_height * h->mb_uvlinesize, + h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], + h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); + } } }else{ int list = list1 ? 1 : 0; @@ -632,6 +662,14 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); + if (CHROMA422) { + chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize, + h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); + chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize, + h->mb_uvlinesize, h->chroma_log2_weight_denom, + h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); + } } } } @@ -1851,13 +1889,13 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i } if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if (!h->sps.chroma_format_idc) { - for (i = 0; i < 8; i++) { + for (i = 0; i < block_h; i++) { uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize); for (j = 0; j < 8; j++) { tmp_cb[j] = 1 << (bit_depth - 1); } } - for (i = 0; i < 8; i++) { + for (i = 0; i < block_h; i++) { 
uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize); for (j = 0; j < 8; j++) { tmp_cr[j] = 1 << (bit_depth - 1); @@ -1882,7 +1920,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i } if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if (!h->sps.chroma_format_idc) { - for (i = 0; i < 8; i++) { + for (i = 0; i < block_h; i++) { memset(dest_cb + i*uvlinesize, 128, 8); memset(dest_cr + i*uvlinesize, 128, 8); } @@ -1931,6 +1969,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16)) idct_add (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize); } + if (CHROMA422) { + for(i=j*16+4; i<j*16+8; i++){ + if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16)) + idct_add (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize); + } + } } } }else{ diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c index 17199d01e6..37a4cf1486 100644 --- a/libavcodec/h264pred.c +++ b/libavcodec/h264pred.c @@ -462,10 +462,10 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x16_dc , depth);\ h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc , depth);\ h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc , depth);\ - h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\ - h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\ - h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ - h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\ + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l0t, depth);\ + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0lt, depth);\ + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l00, depth);\ + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]=
FUNC(pred8x16_mad_cow_dc_0l0, depth);\ }\ }else{\ h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\ @@ -510,8 +510,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co h->pred4x4_add [ HOR_PRED ]= FUNCC(pred4x4_horizontal_add , depth);\ h->pred8x8l_add [VERT_PRED ]= FUNCC(pred8x8l_vertical_add , depth);\ h->pred8x8l_add [ HOR_PRED ]= FUNCC(pred8x8l_horizontal_add , depth);\ + if (chroma_format_idc == 1) {\ h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add , depth);\ h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add , depth);\ + } else {\ + h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x16_vertical_add , depth);\ + h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x16_horizontal_add , depth);\ + }\ h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add , depth);\ h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add , depth);\ diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c index d4f654e18c..318b56196d 100644 --- a/libavcodec/h264pred_template.c +++ b/libavcodec/h264pred_template.c @@ -657,29 +657,50 @@ static void FUNCC(pred8x16_dc)(uint8_t *_src, int stride){ } } -//the following 4 function should not be optimized! 
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){ FUNCC(pred8x8_top_dc)(src, stride); FUNCC(pred4x4_dc)(src, NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){ + FUNCC(pred8x16_top_dc)(src, stride); + FUNCC(pred4x4_dc)(src, NULL, stride); +} + static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){ FUNCC(pred8x8_dc)(src, stride); FUNCC(pred4x4_top_dc)(src, NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){ + FUNCC(pred8x16_dc)(src, stride); + FUNCC(pred4x4_top_dc)(src, NULL, stride); +} + static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){ FUNCC(pred8x8_left_dc)(src, stride); FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){ + FUNCC(pred8x16_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); +} + static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){ FUNCC(pred8x8_left_dc)(src, stride); FUNCC(pred4x4_128_dc)(src , NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){ + FUNCC(pred8x16_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); +} + static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){ int j, k; int a; @@ -1126,8 +1147,24 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, c FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); } +static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + 
i*16*sizeof(pixel), stride); + for(i=4; i<8; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride); +} + static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ int i; for(i=0; i<4; i++) FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); } + +static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); + for(i=4; i<8; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride); +} diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 910ad8401f..06ee7cad43 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -354,7 +354,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom { int mm_flags = av_get_cpu_flags(); - if (mm_flags & AV_CPU_FLAG_MMX2) { + if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) { c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; } From c2d337429c7c87ee559efe54dbc0f84f2a25c3a4 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 21 Oct 2011 00:00:39 -0700 Subject: [PATCH 07/35] H264: change weight/biweight functions to take a height argument. Neon parts by Mans Rullgard . 
--- libavcodec/arm/h264dsp_init_arm.c | 77 +++------- libavcodec/arm/h264dsp_neon.S | 86 ++++------- libavcodec/h264.c | 126 +++++++--------- libavcodec/h264dsp.c | 28 +--- libavcodec/h264dsp.h | 10 +- libavcodec/h264dsp_template.c | 28 ++-- libavcodec/ppc/h264_altivec.c | 44 +++--- libavcodec/x86/h264_weight.asm | 210 ++++++++++----------------- libavcodec/x86/h264_weight_10bit.asm | 145 +++++++----------- libavcodec/x86/h264dsp_mmx.c | 175 ++++++++-------------- 10 files changed, 337 insertions(+), 592 deletions(-) diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index c1ca217add..1c331a495d 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); +void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); +void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); +void 
ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); -void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); +void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); @@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = 
ff_h264_h_loop_filter_chroma_neon; - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; - c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; - c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; - c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; - c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; - c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; - c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; - c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; - c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; - c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; - c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; c->h264_idct_add = ff_h264_idct_add_neon; c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S index 0fa4a6b0a5..3d2c6746ae 100644 --- a/libavcodec/arm/h264dsp_neon.S +++ b/libavcodec/arm/h264dsp_neon.S @@ -1592,7 +1592,7 @@ endfunc vdup.8 d1, r5 vmov q2, q8 vmov q3, q8 -1: subs ip, ip, #2 +1: subs r3, r3, #2 vld1.8 {d20-d21},[r0,:128], r2 \macd q2, d0, d20 pld [r0] @@ -1632,7 +1632,7 @@ endfunc vdup.8 d1, r5 vmov q1, 
q8 vmov q10, q8 -1: subs ip, ip, #2 +1: subs r3, r3, #2 vld1.8 {d4},[r0,:64], r2 \macd q1, d0, d4 pld [r0] @@ -1662,7 +1662,7 @@ endfunc vdup.8 d1, r5 vmov q1, q8 vmov q10, q8 -1: subs ip, ip, #4 +1: subs r3, r3, #4 vld1.32 {d4[0]},[r0,:32], r2 vld1.32 {d4[1]},[r0,:32], r2 \macd q1, d0, d4 @@ -1700,16 +1700,17 @@ endfunc .endm .macro biweight_func w -function biweight_h264_pixels_\w\()_neon +function ff_biweight_h264_pixels_\w\()_neon, export=1 push {r4-r6, lr} - add r4, sp, #16 + ldr r12, [sp, #16] + add r4, sp, #20 ldm r4, {r4-r6} lsr lr, r4, #31 add r6, r6, #1 eors lr, lr, r5, lsr #30 orr r6, r6, #1 - vdup.16 q9, r3 - lsl r6, r6, r3 + vdup.16 q9, r12 + lsl r6, r6, r12 vmvn q9, q9 vdup.16 q8, r6 mov r6, r0 @@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon endfunc .endm - .macro biweight_entry w, h, b=1 -function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 - mov ip, #\h -.if \b - b biweight_h264_pixels_\w\()_neon -.endif -endfunc - .endm - - biweight_entry 16, 8 - biweight_entry 16, 16, b=0 biweight_func 16 - - biweight_entry 8, 16 - biweight_entry 8, 4 - biweight_entry 8, 8, b=0 biweight_func 8 - - biweight_entry 4, 8 - biweight_entry 4, 2 - biweight_entry 4, 4, b=0 biweight_func 4 @ Weighted prediction .macro weight_16 add - vdup.8 d0, r3 -1: subs ip, ip, #2 + vdup.8 d0, r12 +1: subs r2, r2, #2 vld1.8 {d20-d21},[r0,:128], r1 vmull.u8 q2, d0, d20 pld [r0] @@ -1785,8 +1767,8 @@ endfunc .endm .macro weight_8 add - vdup.8 d0, r3 -1: subs ip, ip, #2 + vdup.8 d0, r12 +1: subs r2, r2, #2 vld1.8 {d4},[r0,:64], r1 vmull.u8 q1, d0, d4 pld [r0] @@ -1806,10 +1788,10 @@ endfunc .endm .macro weight_4 add - vdup.8 d0, r3 + vdup.8 d0, r12 vmov q1, q8 vmov q10, q8 -1: subs ip, ip, #4 +1: subs r2, r2, #4 vld1.32 {d4[0]},[r0,:32], r1 vld1.32 {d4[1]},[r0,:32], r1 vmull.u8 q1, d0, d4 @@ -1842,50 +1824,32 @@ endfunc .endm .macro weight_func w -function weight_h264_pixels_\w\()_neon +function ff_weight_h264_pixels_\w\()_neon, export=1 push {r4, lr} - ldr r4, [sp, 
#8] - cmp r2, #1 - lsl r4, r4, r2 + ldr r12, [sp, #8] + ldr r4, [sp, #12] + cmp r3, #1 + lsl r4, r4, r3 vdup.16 q8, r4 mov r4, r0 ble 20f - rsb lr, r2, #1 + rsb lr, r3, #1 vdup.16 q9, lr - cmp r3, #0 + cmp r12, #0 blt 10f weight_\w vhadd.s16 -10: rsb r3, r3, #0 +10: rsb r12, r12, #0 weight_\w vhsub.s16 -20: rsb lr, r2, #0 +20: rsb lr, r3, #0 vdup.16 q9, lr - cmp r3, #0 + cmp r12, #0 blt 10f weight_\w vadd.s16 -10: rsb r3, r3, #0 +10: rsb r12, r12, #0 weight_\w vsub.s16 endfunc .endm - .macro weight_entry w, h, b=1 -function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 - mov ip, #\h -.if \b - b weight_h264_pixels_\w\()_neon -.endif -endfunc - .endm - - weight_entry 16, 8 - weight_entry 16, 16, b=0 weight_func 16 - - weight_entry 8, 16 - weight_entry 8, 4 - weight_entry 8, 8, b=0 weight_func 8 - - weight_entry 4, 8 - weight_entry 4, 2 - weight_entry 4, 4, b=0 weight_func 4 diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 8d652f13ce..7306828197 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){ } #endif -static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, +static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, + int height, int delta, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, @@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); src_cb= s->edge_emu_buffer; } - chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); + chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); if(emu){ 
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); src_cr= s->edge_emu_buffer; } - chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); + chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); } -static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, +static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int x_offset, int y_offset, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, @@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei if(list0){ Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; - mc_dir_part(h, ref, n, square, chroma_height, delta, 0, + mc_dir_part(h, ref, n, square, height, delta, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op, pixel_shift, chroma444); @@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei if(list1){ Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; - mc_dir_part(h, ref, n, square, chroma_height, delta, 1, + mc_dir_part(h, ref, n, square, height, delta, 1, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op, pixel_shift, chroma444); } } -static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, +static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int x_offset, int y_offset, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, @@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, int 
list0, int list1, int pixel_shift, int chroma444){ MpegEncContext * const s = &h->s; + int chroma_height; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; if(chroma444){ + chroma_height = height; chroma_weight_avg = luma_weight_avg; chroma_weight_op = luma_weight_op; dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; } else if (CHROMA422) { + chroma_height = height; dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; }else{ + chroma_height = height >> 1; dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; } @@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom int refn0 = h->ref_cache[0][ scan8[n] ]; int refn1 = h->ref_cache[1][ scan8[n] ]; - mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, + mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); - mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, + mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, tmp_y, tmp_cb, tmp_cr, x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); if(h->use_weight == 2){ int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; int weight1 = 64 - weight0; - luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); - if (CHROMA422) { - chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, - tmp_cb + chroma_height * h->mb_uvlinesize, - h->mb_uvlinesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cr + 
chroma_height * h->mb_uvlinesize, - tmp_cr + chroma_height * h->mb_uvlinesize, - h->mb_uvlinesize, 5, weight0, weight1, 0); - } + luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, + height, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, + chroma_height, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, + chroma_height, 5, weight0, weight1, 0); }else{ - luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, + luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom, h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); - chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); - chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); - if (CHROMA422) { - chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, - tmp_cb + chroma_height * h->mb_uvlinesize, - h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], - h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); - chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, - tmp_cr + chroma_height * h->mb_uvlinesize, - h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], - h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); - } } }else{ int list = list1 ? 
1 : 0; int refn = h->ref_cache[list][ scan8[n] ]; Picture *ref= &h->ref_list[list][refn]; - mc_dir_part(h, ref, n, square, chroma_height, delta, list, + mc_dir_part(h, ref, n, square, height, delta, list, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); - luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, + luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom, h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); if(h->use_weight_chroma){ - chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); - chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); - if (CHROMA422) { - chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize, - h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); - chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize, - h->mb_uvlinesize, h->chroma_log2_weight_denom, - h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); - } } } } -static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, +static inline void mc_part(H264Context *h, int n, int square, int height, int delta, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int x_offset, int y_offset, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, @@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height, if((h->use_weight==2 && list0 && list1 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) || h->use_weight==1) - 
mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, - weight_op[0], weight_op[3], weight_avg[0], - weight_avg[3], list0, list1, pixel_shift, chroma444); + weight_op[0], weight_op[1], weight_avg[0], + weight_avg[1], list0, list1, pixel_shift, chroma444); else - mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1, pixel_shift, chroma444); } @@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t prefetch_motion(h, 0, pixel_shift, chroma444); if(IS_16X16(mb_type)){ - mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, + mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], weight_op, weight_avg, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), pixel_shift, chroma444); }else if(IS_16X8(mb_type)){ - mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, + mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], + weight_op, weight_avg, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), pixel_shift, chroma444); - mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, + mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], + weight_op, weight_avg, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), pixel_shift, chroma444); }else if(IS_8X16(mb_type)){ - mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, + mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[1], chroma_put[1], qpix_avg[1], 
chroma_avg[1], - &weight_op[2], &weight_avg[2], + &weight_op[1], &weight_avg[1], IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), pixel_shift, chroma444); - mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, + mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[2], &weight_avg[2], + &weight_op[1], &weight_avg[1], IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), pixel_shift, chroma444); }else{ @@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t int y_offset= (i&2)<<1; if(IS_SUB_8X8(sub_mb_type)){ - mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, + mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[3], &weight_avg[3], + &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), pixel_shift, chroma444); }else if(IS_SUB_8X4(sub_mb_type)){ - mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, + mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], + &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), pixel_shift, chroma444); - mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, + mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], + &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), pixel_shift, chroma444); }else if(IS_SUB_4X8(sub_mb_type)){ - mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, + mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, 
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], + &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), pixel_shift, chroma444); - mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, + mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], + &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), pixel_shift, chroma444); }else{ @@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t for(j=0; j<4; j++){ int sub_x_offset= x_offset + 2*(j&1); int sub_y_offset= y_offset + (j&2); - mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, + mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[6], &weight_avg[6], + &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), pixel_shift, chroma444); } diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index 19ad2db3d9..ba967079fb 100644 --- a/libavcodec/h264dsp.c +++ b/libavcodec/h264dsp.c @@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo else\ c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ \ - c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ - c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ - c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ - c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ - c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ - c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ - c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ - 
c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ - c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\ - c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\ - c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\ - c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\ - c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\ - c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\ - c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\ - c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\ - c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\ - c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\ - c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\ - c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\ + c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\ + c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\ + c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\ + c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\ + c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\ + c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\ + c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\ + c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\ \ c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index 7337f178e9..7cae215a95 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -31,16 +31,18 @@ #include "dsputil.h" //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); -typedef void (*h264_weight_func)(uint8_t *block, int 
stride, int log2_denom, int weight, int offset); -typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); +typedef void (*h264_weight_func)(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset); +typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height, + int log2_denom, int weightd, int weights, int offset); /** * Context for storing H.264 DSP functions */ typedef struct H264DSPContext{ /* weighted MC */ - h264_weight_func weight_h264_pixels_tab[10]; - h264_biweight_func biweight_h264_pixels_tab[10]; + h264_weight_func weight_h264_pixels_tab[4]; + h264_biweight_func biweight_h264_pixels_tab[4]; /* loop filter */ void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c index ee4bbe51dc..3d99cfcfec 100644 --- a/libavcodec/h264dsp_template.c +++ b/libavcodec/h264dsp_template.c @@ -29,14 +29,16 @@ #define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) #define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) -#define H264_WEIGHT(W,H) \ -static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \ +#define H264_WEIGHT(W) \ +static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \ + int log2_denom, int weight, int offset) \ +{ \ int y; \ pixel *block = (pixel*)_block; \ stride /= sizeof(pixel); \ offset <<= (log2_denom + (BIT_DEPTH-8)); \ if(log2_denom) offset += 1<<(log2_denom-1); \ - for(y=0; ybits_per_raw_sample > 8; @@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; - 
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; - c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; - c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; - c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; - c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec; + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec; } } } diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index d80ca32583..bc8bfd686e 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -28,21 +28,20 @@ SECTION .text ;----------------------------------------------------------------------------- ; biweight pred: ; -; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, -; int log2_denom, int weightd, int weights, -; int offset); +; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, +; int height, int log2_denom, int weightd, +; int weights, int offset); ; and -; void h264_weight_16x16_sse2(uint8_t *dst, int stride, -; int log2_denom, int weight, -; int offset); +; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, +; int log2_denom, int weight, int offset); ;----------------------------------------------------------------------------- %macro WEIGHT_SETUP 0 - add r4, r4 - inc r4 - movd m3, r3d - movd m5, r4d - movd m6, r2d + add r5, r5 + inc r5 + movd m3, r4d + movd 
m5, r5d + movd m6, r3d pslld m5, m6 psrld m5, 1 %if mmsize == 16 @@ -71,60 +70,41 @@ SECTION .text packuswb m0, m1 %endmacro -%macro WEIGHT_FUNC_DBL_MM 1 -cglobal h264_weight_16x%1_mmx2, 5, 5, 0 +INIT_MMX +cglobal h264_weight_16_mmx2, 6, 6, 0 WEIGHT_SETUP - mov r2, %1 -%if %1 == 16 .nextrow WEIGHT_OP 0, 4 mova [r0 ], m0 WEIGHT_OP 8, 12 mova [r0+8], m0 add r0, r1 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) -%endif -%endmacro -INIT_MMX -WEIGHT_FUNC_DBL_MM 16 -WEIGHT_FUNC_DBL_MM 8 - -%macro WEIGHT_FUNC_MM 4 -cglobal h264_weight_%1x%2_%4, 7, 7, %3 +%macro WEIGHT_FUNC_MM 3 +cglobal h264_weight_%1_%3, 6, 6, %2 WEIGHT_SETUP - mov r2, %2 -%if %2 == 16 .nextrow WEIGHT_OP 0, mmsize/2 mova [r0], m0 add r0, r1 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_%1x16_%4.nextrow) -%endif %endmacro INIT_MMX -WEIGHT_FUNC_MM 8, 16, 0, mmx2 -WEIGHT_FUNC_MM 8, 8, 0, mmx2 -WEIGHT_FUNC_MM 8, 4, 0, mmx2 +WEIGHT_FUNC_MM 8, 0, mmx2 INIT_XMM -WEIGHT_FUNC_MM 16, 16, 8, sse2 -WEIGHT_FUNC_MM 16, 8, 8, sse2 +WEIGHT_FUNC_MM 16, 8, sse2 -%macro WEIGHT_FUNC_HALF_MM 5 -cglobal h264_weight_%1x%2_%5, 5, 5, %4 +%macro WEIGHT_FUNC_HALF_MM 3 +cglobal h264_weight_%1_%3, 6, 6, %2 WEIGHT_SETUP - mov r2, %2/2 + sar r2d, 1 lea r3, [r1*2] -%if %2 == mmsize .nextrow WEIGHT_OP 0, r1 movh [r0], m0 @@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4 movh [r0+r1], m0 %endif add r0, r3 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) -%endif %endmacro INIT_MMX -WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 -WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 -WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 INIT_XMM -WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 -WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 -WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 
%macro BIWEIGHT_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m3, r4d - movd m4, r5d - movd m5, r6d - movd m6, r3d +%ifdef ARCH_X86_64 +%define off_regd r11d +%else +%define off_regd r3d +%endif + mov off_regd, r7m + add off_regd, 1 + or off_regd, 1 + add r4, 1 + movd m3, r5d + movd m4, r6d + movd m5, off_regd + movd m6, r4d pslld m5, m6 psrld m5, 1 %if mmsize == 16 @@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 packuswb m0, m1 %endmacro -%macro BIWEIGHT_FUNC_DBL_MM 1 -cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 +INIT_MMX +cglobal h264_biweight_16_mmx2, 7, 7, 0 BIWEIGHT_SETUP - mov r3, %1 -%if %1 == 16 + movifnidn r3d, r3m .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, 4 @@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 mova [r0+8], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) -%endif -%endmacro -INIT_MMX -BIWEIGHT_FUNC_DBL_MM 16 -BIWEIGHT_FUNC_DBL_MM 8 - -%macro BIWEIGHT_FUNC_MM 4 -cglobal h264_biweight_%1x%2_%4, 7, 7, %3 +%macro BIWEIGHT_FUNC_MM 3 +cglobal h264_biweight_%1_%3, 7, 7, %2 BIWEIGHT_SETUP - mov r3, %2 -%if %2 == 16 + movifnidn r3d, r3m .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, mmsize/2 @@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3 mova [r0], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) -%endif %endmacro INIT_MMX -BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 -BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 -BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 0, mmx2 INIT_XMM -BIWEIGHT_FUNC_MM 16, 16, 8, sse2 -BIWEIGHT_FUNC_MM 16, 8, 8, sse2 +BIWEIGHT_FUNC_MM 16, 8, sse2 -%macro BIWEIGHT_FUNC_HALF_MM 5 -cglobal h264_biweight_%1x%2_%5, 7, 7, %4 +%macro BIWEIGHT_FUNC_HALF_MM 3 +cglobal h264_biweight_%1_%3, 7, 7, %2 BIWEIGHT_SETUP - mov r3, %2/2 + movifnidn r3d, r3m + sar r3, 1 lea r4, [r2*2] -%if %2 == mmsize .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, r2 @@ 
-270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4 %endif add r0, r4 add r1, r4 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) -%endif %endmacro INIT_MMX -BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 -BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 -BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2 INIT_XMM -BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 -BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 -BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 %macro BIWEIGHT_SSSE3_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m4, r4d - movd m0, r5d - movd m5, r6d - movd m6, r3d +%ifdef ARCH_X86_64 +%define off_regd r11d +%else +%define off_regd r3d +%endif + mov off_regd, r7m + add off_regd, 1 + or off_regd, 1 + add r4, 1 + movd m4, r5d + movd m0, r6d + movd m5, off_regd + movd m6, r4d pslld m5, m6 psrld m5, 1 punpcklbw m4, m0 @@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 packuswb m0, m2 %endmacro -%macro BIWEIGHT_SSSE3_16 1 -cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 +INIT_XMM +cglobal h264_biweight_16_ssse3, 7, 7, 8 BIWEIGHT_SSSE3_SETUP - mov r3, %1 + movifnidn r3d, r3m -%if %1 == 16 .nextrow movh m0, [r0] movh m2, [r0+8] @@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 mova [r0], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) -%endif -%endmacro INIT_XMM -BIWEIGHT_SSSE3_16 16 -BIWEIGHT_SSSE3_16 8 - -%macro BIWEIGHT_SSSE3_8 1 -cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 +cglobal h264_biweight_8_ssse3, 7, 7, 8 BIWEIGHT_SSSE3_SETUP - mov r3, %1/2 + movifnidn r3d, r3m + sar r3, 1 lea r4, [r2*2] -%if %1 == 16 .nextrow movh m0, [r0] movh m1, [r1] @@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 movhps [r0+r2], m0 add r0, r4 add r1, r4 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) -%endif -%endmacro - -INIT_XMM -BIWEIGHT_SSSE3_8 16 
-BIWEIGHT_SSSE3_8 8 -BIWEIGHT_SSSE3_8 4 diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm index 1c58d72d94..20df6fbab5 100644 --- a/libavcodec/x86/h264_weight_10bit.asm +++ b/libavcodec/x86/h264_weight_10bit.asm @@ -36,33 +36,26 @@ cextern pw_1 SECTION .text ;----------------------------------------------------------------------------- -; void h264_weight(uint8_t *dst, int stride, int log2_denom, +; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, ; int weight, int offset); ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_32 -DECLARE_REG_TMP 2 -%else -DECLARE_REG_TMP 10 -%endif - -%macro WEIGHT_PROLOGUE 1 - mov t0, %1 +%macro WEIGHT_PROLOGUE 0 .prologue - PROLOGUE 0,5,8 + PROLOGUE 0,6,8 movifnidn r0, r0mp movifnidn r1d, r1m - movifnidn r3d, r3m movifnidn r4d, r4m + movifnidn r5d, r5m %endmacro %macro WEIGHT_SETUP 1 mova m0, [pw_1] - movd m2, r2m + movd m2, r3m pslld m0, m2 ; 1<h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; #endif - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; - c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; - c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; - c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; - c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; - - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; - c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; - 
c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; - c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; + c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2; + + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2; if (mm_flags&AV_CPU_FLAG_SSE2) { c->h264_idct8_add = ff_h264_idct8_add_8_sse2; @@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; - c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; + c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2; - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2; #if HAVE_ALIGNED_STACK c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; @@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom #endif } if (mm_flags&AV_CPU_FLAG_SSSE3) { - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; - c->biweight_h264_pixels_tab[2]= 
ff_h264_biweight_8x16_ssse3; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3; } if (mm_flags&AV_CPU_FLAG_AVX) { #if HAVE_ALIGNED_STACK @@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; #endif - c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; - c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; - c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2; - c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; - c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; - c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; - c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; - c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2; - c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2; - c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2; - c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2; - c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2; + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; c->h264_v_loop_filter_chroma_intra= 
ff_deblock_v_chroma_intra_10_sse2; @@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom #endif } if (mm_flags&AV_CPU_FLAG_SSE4) { - c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; - c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; - c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; - c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4; - c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; - c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; - c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; - c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; - c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4; - c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4; - c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4; - c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4; - c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4; + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; } #if HAVE_AVX if (mm_flags&AV_CPU_FLAG_AVX) { From 05fb63f5a070154aa7e681fa8617a5322322559a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 21 Oct 2011 00:01:16 -0700 Subject: [PATCH 08/35] H264: have hl_motion() and its callees take a chroma_idc argument. 
--- libavcodec/h264.c | 150 ++++++++++++++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 52 deletions(-) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 7306828197..0525df3712 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -440,10 +440,11 @@ static void chroma_dc_dct_c(DCTELEM *block){ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int height, int delta, int list, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int src_x_offset, int src_y_offset, - qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, - int pixel_shift, int chroma444){ + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int src_x_offset, int src_y_offset, + qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, + int pixel_shift, int chroma_idc) +{ MpegEncContext * const s = &h->s; const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; @@ -480,7 +481,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return; - if(chroma444){ + if(chroma_idc == 3 /* yuv444 */){ src_cb = pic->f.data[1] + offset; if(emu){ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, @@ -505,8 +506,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, return; } - ysh = 3 - !!(CHROMA422); - if(!CHROMA422 && MB_FIELD){ + ysh = 3 - (chroma_idc == 2 /* yuv422 */); + if(chroma_idc == 1 /* yuv420 */ && MB_FIELD){ // chroma offset when predicting from a field of opposite parity my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1)); emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1); @@ -516,16 +517,22 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize; if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, 
src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, + 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), + pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); src_cb= s->edge_emu_buffer; } - chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); + chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */), + mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7); if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, + 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), + pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); src_cr= s->edge_emu_buffer; } - chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); + chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */), + mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7); } static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta, @@ -533,19 +540,20 @@ static inline void mc_part_std(H264Context *h, int n, int square, int height, in int x_offset, int y_offset, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - int list0, int list1, int pixel_shift, int chroma444){ + int list0, int list1, int pixel_shift, int chroma_idc) +{ MpegEncContext * const s = &h->s; qpel_mc_func *qpix_op= qpix_put; h264_chroma_mc_func chroma_op= chroma_put; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; - if(chroma444){ + if (chroma_idc == 3 /* yuv444 */) { dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 
2*y_offset*h->mb_linesize; - } else if (CHROMA422) { + } else if (chroma_idc == 2 /* yuv422 */) { dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; - }else{ + } else /* yuv420 */ { dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; } @@ -556,7 +564,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int height, in Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; mc_dir_part(h, ref, n, square, height, delta, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_op, chroma_op, pixel_shift, chroma444); + qpix_op, chroma_op, pixel_shift, chroma_idc); qpix_op= qpix_avg; chroma_op= chroma_avg; @@ -566,7 +574,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int height, in Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; mc_dir_part(h, ref, n, square, height, delta, 1, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_op, chroma_op, pixel_shift, chroma444); + qpix_op, chroma_op, pixel_shift, chroma_idc); } } @@ -576,22 +584,22 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, - int list0, int list1, int pixel_shift, int chroma444){ + int list0, int list1, int pixel_shift, int chroma_idc){ MpegEncContext * const s = &h->s; int chroma_height; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; - if(chroma444){ + if (chroma_idc == 3 /* yuv444 */) { chroma_height = height; chroma_weight_avg = luma_weight_avg; chroma_weight_op = luma_weight_op; dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; - } else if 
(CHROMA422) { + } else if (chroma_idc == 2 /* yuv422 */) { chroma_height = height; dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; - }else{ + } else /* yuv420 */ { chroma_height = height >> 1; dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; @@ -610,10 +618,12 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); + x_offset, y_offset, qpix_put, chroma_put, + pixel_shift, chroma_idc); mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, tmp_y, tmp_cb, tmp_cr, - x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); + x_offset, y_offset, qpix_put, chroma_put, + pixel_shift, chroma_idc); if(h->use_weight == 2){ int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; @@ -641,7 +651,7 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh Picture *ref= &h->ref_list[list][refn]; mc_dir_part(h, ref, n, square, height, delta, list, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put, chroma_put, pixel_shift, chroma444); + qpix_put, chroma_put, pixel_shift, chroma_idc); luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom, h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); @@ -660,21 +670,22 @@ static inline void mc_part(H264Context *h, int n, int square, int height, int de qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, h264_weight_func *weight_op, h264_biweight_func *weight_avg, - int list0, int list1, int pixel_shift, int chroma444){ + int list0, int list1, int pixel_shift, int chroma_idc) +{ if((h->use_weight==2 && list0 && list1 && 
(h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) || h->use_weight==1) mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, weight_op[0], weight_op[1], weight_avg[0], - weight_avg[1], list0, list1, pixel_shift, chroma444); + weight_avg[1], list0, list1, pixel_shift, chroma_idc); else mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, qpix_avg, - chroma_avg, list0, list1, pixel_shift, chroma444); + chroma_avg, list0, list1, pixel_shift, chroma_idc); } -static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){ +static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc){ /* fetch pixels for estimated mv 4 macroblocks ahead * optimized for 64byte cache lines */ MpegEncContext * const s = &h->s; @@ -685,7 +696,7 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, in uint8_t **src = h->ref_list[list][refn].f.data; int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift); s->dsp.prefetch(src[0]+off, s->linesize, 4); - if(chroma444){ + if (chroma_idc == 3 /* yuv444 */) { s->dsp.prefetch(src[1]+off, s->linesize, 4); s->dsp.prefetch(src[2]+off, s->linesize, 4); }else{ @@ -699,7 +710,8 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), h264_weight_func *weight_op, h264_biweight_func *weight_avg, - int pixel_shift, int chroma444){ + int pixel_shift, int chroma_idc) +{ MpegEncContext * const s = &h->s; const int mb_xy= h->mb_xy; const int mb_type = s->current_picture.f.mb_type[mb_xy]; @@ -708,36 +720,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t if(HAVE_THREADS && 
(s->avctx->active_thread_type & FF_THREAD_FRAME)) await_references(h); - prefetch_motion(h, 0, pixel_shift, chroma444); + prefetch_motion(h, 0, pixel_shift, chroma_idc); if(IS_16X16(mb_type)){ mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], weight_op, weight_avg, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_16X8(mb_type)){ mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], weight_op, weight_avg, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], weight_op, weight_avg, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_8X16(mb_type)){ mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], &weight_op[1], &weight_avg[1], IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], &weight_op[1], &weight_avg[1], IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else{ int i; @@ -754,29 +766,29 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_SUB_8X4(sub_mb_type)){ mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], 
&weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_SUB_4X8(sub_mb_type)){ mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else{ int j; assert(IS_SUB_4X4(sub_mb_type)); @@ -787,13 +799,35 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); } } } } - prefetch_motion(h, 1, pixel_shift, chroma444); + prefetch_motion(h, 1, pixel_shift, chroma_idc); +} + +static av_always_inline void +hl_motion_420(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int pixel_shift) +{ + hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, + qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1); +} + +static av_always_inline void 
+hl_motion_422(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int pixel_shift) +{ + hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, + qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2); } static void free_tables(H264Context *h, int free_rbsp){ @@ -1798,7 +1832,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, } } -static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){ +static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift) +{ MpegEncContext * const s = &h->s; const int mb_x= s->mb_x; const int mb_y= s->mb_y; @@ -1813,6 +1848,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264; void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); const int block_h = 16 >> s->chroma_y_shift; + const int chroma422 = CHROMA422; dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16; dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h; @@ -1927,11 +1963,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i if(h->deblocking_filter) xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift); }else if(is_h264){ - hl_motion(h, dest_y, dest_cb, dest_cr, - s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, - s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, - h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 0); + if (chroma422) { + hl_motion_422(h, dest_y, dest_cb, dest_cr, + s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, + 
s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, + h->h264dsp.weight_h264_pixels_tab, + h->h264dsp.biweight_h264_pixels_tab, + pixel_shift); + } else { + hl_motion_420(h, dest_y, dest_cb, dest_cr, + s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, + s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, + h->h264dsp.weight_h264_pixels_tab, + h->h264dsp.biweight_h264_pixels_tab, + pixel_shift); + } } hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0); @@ -1949,7 +1995,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16)) idct_add (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize); } - if (CHROMA422) { + if (chroma422) { for(i=j*16+4; inon_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16)) idct_add (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize); @@ -1960,7 +2006,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i }else{ if(is_h264){ int qp[2]; - if (CHROMA422) { + if (chroma422) { qp[0] = h->chroma_qp[0] + 3; qp[1] = h->chroma_qp[1] + 3; } else { @@ -2079,7 +2125,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab, h->h264dsp.weight_h264_pixels_tab, - h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 1); + h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3); } for (p = 0; p < plane_count; p++) From 27209bb108c8a3d2c0de2c36dfb973667df24017 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 20 Oct 2011 23:36:23 -0700 Subject: [PATCH 09/35] h264: mark some MC functions with av_always_inline instead of inline. This actually causes them to be inlined, leading to a significant speedup (1-1.5% in my measurements). 
--- libavcodec/h264.c | 60 ++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 0525df3712..17124c3088 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -438,12 +438,13 @@ static void chroma_dc_dct_c(DCTELEM *block){ } #endif -static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, - int height, int delta, int list, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int src_x_offset, int src_y_offset, - qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, - int pixel_shift, int chroma_idc) +static av_always_inline void +mc_dir_part(H264Context *h, Picture *pic, int n, int square, + int height, int delta, int list, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int src_x_offset, int src_y_offset, + qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, + int pixel_shift, int chroma_idc) { MpegEncContext * const s = &h->s; const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; @@ -535,12 +536,13 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7); } -static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - int list0, int list1, int pixel_shift, int chroma_idc) +static av_always_inline void +mc_part_std(H264Context *h, int n, int square, int height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + int list0, int list1, int pixel_shift, int chroma_idc) { MpegEncContext * const s = &h->s; qpel_mc_func *qpix_op= qpix_put; @@ -578,13 +580,14 @@ 
static inline void mc_part_std(H264Context *h, int n, int square, int height, in } } -static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, - h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, - int list0, int list1, int pixel_shift, int chroma_idc){ +static av_always_inline void +mc_part_weighted(H264Context *h, int n, int square, int height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, + h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, + int list0, int list1, int pixel_shift, int chroma_idc){ MpegEncContext * const s = &h->s; int chroma_height; @@ -664,13 +667,14 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int heigh } } -static inline void mc_part(H264Context *h, int n, int square, int height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - h264_weight_func *weight_op, h264_biweight_func *weight_avg, - int list0, int list1, int pixel_shift, int chroma_idc) +static av_always_inline void +mc_part(H264Context *h, int n, int square, int height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int list0, int list1, int pixel_shift, int chroma_idc) { if((h->use_weight==2 && list0 && 
list1 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) @@ -685,7 +689,9 @@ static inline void mc_part(H264Context *h, int n, int square, int height, int de chroma_avg, list0, list1, pixel_shift, chroma_idc); } -static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc){ +static av_always_inline void +prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc) +{ /* fetch pixels for estimated mv 4 macroblocks ahead * optimized for 64byte cache lines */ MpegEncContext * const s = &h->s; From b8bb9c026789ca9cd6d7a3a6263fc6e8a3467767 Mon Sep 17 00:00:00 2001 From: Jean First Date: Fri, 30 Sep 2011 09:42:45 +0200 Subject: [PATCH 10/35] Enable multithreading when decoding with libopenjpeg Enable multithreading when decoding with libopenjpeg Signed-off-by: Michael Niedermayer Signed-off-by: Ronald S. Bultje --- libavcodec/libopenjpeg.c | 45 ++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/libavcodec/libopenjpeg.c b/libavcodec/libopenjpeg.c index 42809b992f..1facd21044 100644 --- a/libavcodec/libopenjpeg.c +++ b/libavcodec/libopenjpeg.c @@ -27,6 +27,7 @@ #include "libavutil/imgutils.h" #include "avcodec.h" #include "libavutil/intreadwrite.h" +#include "thread.h" #define OPJ_STATIC #include <openjpeg.h> @@ -57,6 +58,14 @@ static av_cold int libopenjpeg_decode_init(AVCodecContext *avctx) return 0; } +static av_cold int libopenjpeg_decode_init_thread_copy(AVCodecContext *avctx) +{ + LibOpenJPEGContext *ctx = avctx->priv_data; + + avctx->coded_frame = &ctx->image; + return 0; +} + static int libopenjpeg_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) @@ -94,7 +103,7 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx, } opj_set_event_mgr((opj_common_ptr)dec, NULL, NULL); - ctx->dec_params.cp_reduce = avctx->lowres; + ctx->dec_params.cp_limit_decoding = LIMIT_TO_MAIN_HEADER; // Tie decoder with
decoding parameters opj_setup_decoder(dec, &ctx->dec_params); stream = opj_cio_open((opj_common_ptr)dec, buf, buf_size); @@ -104,7 +113,7 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx, return -1; } - // Decode the codestream + // Decode the header only image = opj_decode_with_info(dec, stream, NULL); opj_cio_close(stream); if(!image) { @@ -112,8 +121,8 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx, opj_destroy_decompress(dec); return -1; } - width = image->comps[0].w << avctx->lowres; - height = image->comps[0].h << avctx->lowres; + width = image->x1 - image->x0; + height = image->y1 - image->y0; if(av_image_check_size(width, height, 0, avctx) < 0) { av_log(avctx, AV_LOG_ERROR, "%dx%d dimension invalid.\n", width, height); goto done; @@ -139,13 +148,30 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx, } if(picture->data[0]) - avctx->release_buffer(avctx, picture); + ff_thread_release_buffer(avctx, picture); + + if(ff_thread_get_buffer(avctx, picture) < 0){ + av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed\n"); + return -1; + } - if(avctx->get_buffer(avctx, picture) < 0) { - av_log(avctx, AV_LOG_ERROR, "Couldn't allocate image buffer.\n"); + ff_thread_finish_setup(avctx); + + ctx->dec_params.cp_limit_decoding = NO_LIMITATION; + ctx->dec_params.cp_reduce = avctx->lowres; + // Tie decoder with decoding parameters + opj_setup_decoder(dec, &ctx->dec_params); + stream = opj_cio_open((opj_common_ptr)dec, buf, buf_size); + if(!stream) { + av_log(avctx, AV_LOG_ERROR, "Codestream could not be opened for reading.\n"); + opj_destroy_decompress(dec); return -1; } + // Decode the codestream + image = opj_decode_with_info(dec, stream, NULL); + opj_cio_close(stream); + for(x = 0; x < image->numcomps; x++) { adjust[x] = FFMAX(image->comps[x].prec - 8, 0); } @@ -179,7 +205,7 @@ static av_cold int libopenjpeg_decode_close(AVCodecContext *avctx) LibOpenJPEGContext *ctx = avctx->priv_data; if(ctx->image.data[0]) - 
avctx->release_buffer(avctx, &ctx->image); + ff_thread_release_buffer(avctx, &ctx->image); return 0 ; } @@ -192,7 +218,8 @@ AVCodec ff_libopenjpeg_decoder = { .init = libopenjpeg_decode_init, .close = libopenjpeg_decode_close, .decode = libopenjpeg_decode_frame, - .capabilities = CODEC_CAP_DR1, + .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS, .max_lowres = 5, .long_name = NULL_IF_CONFIG_SMALL("OpenJPEG based JPEG 2000 decoder"), + .init_thread_copy = ONLY_IF_THREADS_ENABLED(libopenjpeg_decode_init_thread_copy) }; From b034c95cc1a700b7d0849cedb1316989c3fb15be Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 21 Oct 2011 12:34:08 +0100 Subject: [PATCH 11/35] h264: fix ppc/altivec build Signed-off-by: Mans Rullgard --- libavcodec/ppc/h264_altivec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c index edc043c3c7..3163a37d3b 100644 --- a/libavcodec/ppc/h264_altivec.c +++ b/libavcodec/ppc/h264_altivec.c @@ -956,11 +956,11 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height, #define H264_WEIGHT(W) \ static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \ int log2_denom, int weight, int offset){ \ - weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \ + weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \ }\ static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \ int log2_denom, int weightd, int weights, int offset){ \ - biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \ + biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \ } H264_WEIGHT(16) From ef74e3979930d99830e01a52b0e09f6997938696 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Thu, 20 Oct 2011 15:08:48 -0400 Subject: [PATCH 12/35] flvenc: store delay and 
last_ts per-stream. --- libavformat/flvenc.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/libavformat/flvenc.c b/libavformat/flvenc.c index cf77157636..0d1fa7312a 100644 --- a/libavformat/flvenc.c +++ b/libavformat/flvenc.c @@ -57,10 +57,13 @@ typedef struct FLVContext { int64_t duration_offset; int64_t filesize_offset; int64_t duration; - int delay; ///< first dts delay for AVC - int64_t last_ts; } FLVContext; +typedef struct FLVStreamContext { + int delay; ///< first dts delay for each stream (needed for AVC & Speex) + int64_t last_ts; ///< last timestamp for each stream +} FLVStreamContext; + static int get_audio_flags(AVCodecContext *enc){ int flags = (enc->bits_per_coded_sample == 16) ? FLV_SAMPLESSIZE_16BIT : FLV_SAMPLESSIZE_8BIT; @@ -179,6 +182,7 @@ static int flv_write_header(AVFormatContext *s) for(i=0; i<s->nb_streams; i++){ AVCodecContext *enc = s->streams[i]->codec; + FLVStreamContext *sc; if (enc->codec_type == AVMEDIA_TYPE_VIDEO) { if (s->streams[i]->r_frame_rate.den && s->streams[i]->r_frame_rate.num) { framerate = av_q2d(s->streams[i]->r_frame_rate); @@ -196,6 +200,12 @@ static int flv_write_header(AVFormatContext *s) return -1; } av_set_pts_info(s->streams[i], 32, 1, 1000); /* 32 bit pts in ms */ + + sc = av_mallocz(sizeof(FLVStreamContext)); + if (!sc) + return AVERROR(ENOMEM); + s->streams[i]->priv_data = sc; + sc->last_ts = -1; } avio_write(pb, "FLV", 3); avio_w8(pb,1); @@ -215,8 +225,6 @@ static int flv_write_header(AVFormatContext *s) } } - flv->last_ts = -1; - /* write meta_tag */ avio_w8(pb, 18); // tag type META metadata_size_pos= avio_tell(pb); @@ -342,9 +350,10 @@ static int flv_write_trailer(AVFormatContext *s) /* Add EOS tag */ for (i = 0; i < s->nb_streams; i++) { AVCodecContext *enc = s->streams[i]->codec; + FLVStreamContext *sc = s->streams[i]->priv_data; if (enc->codec_type == AVMEDIA_TYPE_VIDEO && enc->codec_id == CODEC_ID_H264) { - put_avc_eos_tag(pb, flv->last_ts); +
put_avc_eos_tag(pb, sc->last_ts); } } @@ -365,6 +374,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt) AVIOContext *pb = s->pb; AVCodecContext *enc = s->streams[pkt->stream_index]->codec; FLVContext *flv = s->priv_data; + FLVStreamContext *sc = s->streams[pkt->stream_index]->priv_data; unsigned ts; int size= pkt->size; uint8_t *data= NULL; @@ -406,20 +416,20 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt) return -1; } } - if (!flv->delay && pkt->dts < 0) - flv->delay = -pkt->dts; + if (!sc->delay && pkt->dts < 0) + sc->delay = -pkt->dts; - ts = pkt->dts + flv->delay; // add delay to force positive dts + ts = pkt->dts + sc->delay; // add delay to force positive dts /* check Speex packet duration */ - if (enc->codec_id == CODEC_ID_SPEEX && ts - flv->last_ts > 160) { + if (enc->codec_id == CODEC_ID_SPEEX && ts - sc->last_ts > 160) { av_log(s, AV_LOG_WARNING, "Warning: Speex stream has more than " "8 frames per packet. Adobe Flash " "Player cannot handle this!\n"); } - if (flv->last_ts < ts) - flv->last_ts = ts; + if (sc->last_ts < ts) + sc->last_ts = ts; avio_wb24(pb,size + flags_size); avio_wb24(pb,ts); @@ -440,7 +450,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt) avio_write(pb, data ? 
data : pkt->data, size); avio_wb32(pb,size+flags_size+11); // previous tag size - flv->duration = FFMAX(flv->duration, pkt->pts + flv->delay + pkt->duration); + flv->duration = FFMAX(flv->duration, pkt->pts + sc->delay + pkt->duration); avio_flush(pb); From 45add995de6a1458cd8095abb302f9a7cbd3e3ee Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 9 Oct 2011 16:30:11 -0400 Subject: [PATCH 13/35] fmtconvert: fix and extend documentation for float_interleave() --- libavcodec/fmtconvert.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h index d7741135b7..1b534019f1 100644 --- a/libavcodec/fmtconvert.h +++ b/libavcodec/fmtconvert.h @@ -70,7 +70,15 @@ typedef struct FmtConvertContext { long len, int channels); /** - * Convert an array of interleaved float to multiple arrays of float. + * Convert multiple arrays of float to an array of interleaved float. + * + * @param dst destination array of interleaved float. + * constraints: 16-byte aligned + * @param src source array of float arrays, one for each channel. + * constraints: 16-byte aligned + * @param len number of elements to convert. 
+ * constraints: multiple of 8 + * @param channels number of channels */ void (*float_interleave)(float *dst, const float **src, unsigned int len, int channels); From 708ab7dd69d5c98221882a8086f68f1bb02a44a3 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 9 Oct 2011 19:12:09 -0400 Subject: [PATCH 14/35] fmtconvert: port float_to_int16() x86 inline asm to yasm --- libavcodec/x86/fmtconvert.asm | 42 ++++++++++++++++ libavcodec/x86/fmtconvert_mmx.c | 89 +++++---------------------------- 2 files changed, 55 insertions(+), 76 deletions(-) diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index efab87d570..d314a4e14e 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -24,6 +24,48 @@ SECTION_TEXT +;------------------------------------------------------------------------------ +; void ff_float_to_int16(int16_t *dst, const float *src, long len); +;------------------------------------------------------------------------------ +%macro FLOAT_TO_INT16 2 +cglobal float_to_int16_%1, 3,3,%2, dst, src, len + add lenq, lenq + lea srcq, [srcq+2*lenq] + add dstq, lenq + neg lenq +.loop: +%ifidn %1, sse2 + cvtps2dq m0, [srcq+2*lenq ] + cvtps2dq m1, [srcq+2*lenq+16] + packssdw m0, m1 + mova [dstq+lenq], m0 +%else + cvtps2pi m0, [srcq+2*lenq ] + cvtps2pi m1, [srcq+2*lenq+ 8] + cvtps2pi m2, [srcq+2*lenq+16] + cvtps2pi m3, [srcq+2*lenq+24] + packssdw m0, m1 + packssdw m2, m3 + mova [dstq+lenq ], m0 + mova [dstq+lenq+8], m2 +%endif + add lenq, 16 + js .loop +%ifnidn %1, sse2 + emms +%endif + REP_RET +%endmacro + +INIT_XMM +FLOAT_TO_INT16 sse2, 2 +INIT_MMX +FLOAT_TO_INT16 sse, 0 +%define cvtps2pi pf2id +FLOAT_TO_INT16 3dnow, 0 +%undef cvtps2pi + + %macro PSWAPD_SSE 2 pshufw %1, %2, 0x4e %endmacro diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index 253f60bfc2..949dc973f3 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -70,80 +70,16 @@ static void 
int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mu ); } -static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - // not bit-exact: pf2id uses different rounding than C and SSE - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "pf2id (%2,%0,2) , %%mm0 \n\t" - "pf2id 8(%2,%0,2) , %%mm1 \n\t" - "pf2id 16(%2,%0,2) , %%mm2 \n\t" - "pf2id 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "femms \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} +#if HAVE_YASM -static void float_to_int16_sse(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2pi (%2,%0,2) , %%mm0 \n\t" - "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" - "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" - "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "emms \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} - -static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" - "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" - "packssdw %%xmm1 , %%xmm0 \n\t" - "movdqa %%xmm0 , (%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} +void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); +void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); +void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); void 
ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); -#if !HAVE_YASM -#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) -#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) -#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) -#endif #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ @@ -152,7 +88,7 @@ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const DECLARE_ALIGNED(16, int16_t, tmp)[len];\ int i,j,c;\ for(c=0; cfloat_interleave = float_interleave_mmx; -#endif if(mm_flags & AV_CPU_FLAG_3DNOW){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16 = float_to_int16_3dnow; + c->float_to_int16 = ff_float_to_int16_3dnow; c->float_to_int16_interleave = float_to_int16_interleave_3dnow; } } @@ -285,18 +219,21 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_3dn2; } } +#endif if(mm_flags & AV_CPU_FLAG_SSE){ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; - c->float_to_int16 = float_to_int16_sse; - c->float_to_int16_interleave = float_to_int16_interleave_sse; #if HAVE_YASM + c->float_to_int16 = ff_float_to_int16_sse; + c->float_to_int16_interleave = float_to_int16_interleave_sse; c->float_interleave = float_interleave_sse; #endif } if(mm_flags & AV_CPU_FLAG_SSE2){ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; - c->float_to_int16 = float_to_int16_sse2; +#if HAVE_YASM + c->float_to_int16 = ff_float_to_int16_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2; +#endif } } } From 
185142a5ea93ef723f70a3ea43797f6c8827eb79 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 9 Oct 2011 20:01:22 -0400 Subject: [PATCH 15/35] fmtconvert: check compile-time x86 instruction set flags --- libavcodec/x86/fmtconvert_mmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index 949dc973f3..6e43280d66 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -208,19 +208,19 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) #if HAVE_YASM c->float_interleave = float_interleave_mmx; - if(mm_flags & AV_CPU_FLAG_3DNOW){ + if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) { if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->float_to_int16 = ff_float_to_int16_3dnow; c->float_to_int16_interleave = float_to_int16_interleave_3dnow; } } - if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ + if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) { if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->float_to_int16_interleave = float_to_int16_interleave_3dn2; } } #endif - if(mm_flags & AV_CPU_FLAG_SSE){ + if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) { c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; #if HAVE_YASM c->float_to_int16 = ff_float_to_int16_sse; @@ -228,7 +228,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) c->float_interleave = float_interleave_sse; #endif } - if(mm_flags & AV_CPU_FLAG_SSE2){ + if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) { c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; #if HAVE_YASM c->float_to_int16 = ff_float_to_int16_sse2; From 4e8e2624767f4af0eaa932c543d072fed96fd586 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 9 Oct 2011 23:52:03 -0400 Subject: [PATCH 16/35] fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm --- libavcodec/x86/dsputil_yasm.asm | 8 ----- libavcodec/x86/fmtconvert.asm | 46 +++++++++++++++++++++++++ 
libavcodec/x86/fmtconvert_mmx.c | 59 ++++----------------------------- libavutil/x86/x86util.asm | 12 +++++++ 4 files changed, 65 insertions(+), 60 deletions(-) diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 2a2108404a..fe96d8b12b 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -1055,14 +1055,6 @@ emu_edge mmx ; int32_t max, unsigned int len) ;----------------------------------------------------------------------------- -%macro SPLATD_MMX 1 - punpckldq %1, %1 -%endmacro - -%macro SPLATD_SSE2 1 - pshufd %1, %1, 0 -%endmacro - %macro VECTOR_CLIP_INT32 4 cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len %ifidn %1, sse2 diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index d314a4e14e..e3eb5d2286 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -24,6 +24,52 @@ SECTION_TEXT +;--------------------------------------------------------------------------------- +; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len); +;--------------------------------------------------------------------------------- +%macro INT32_TO_FLOAT_FMUL_SCALAR 2 +%ifdef ARCH_X86_64 +cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len +%else +cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len + movss m0, mulm +%endif + SPLATD m0 + shl lenq, 2 + add srcq, lenq + add dstq, lenq + neg lenq +.loop: +%ifidn %1, sse2 + cvtdq2ps m1, [srcq+lenq ] + cvtdq2ps m2, [srcq+lenq+16] +%else + cvtpi2ps m1, [srcq+lenq ] + cvtpi2ps m3, [srcq+lenq+ 8] + cvtpi2ps m2, [srcq+lenq+16] + cvtpi2ps m4, [srcq+lenq+24] + movlhps m1, m3 + movlhps m2, m4 +%endif + mulps m1, m0 + mulps m2, m0 + mova [dstq+lenq ], m1 + mova [dstq+lenq+16], m2 + add lenq, 32 + jl .loop + REP_RET +%endmacro + +INIT_XMM +%define SPLATD SPLATD_SSE +%define movdqa movaps +INT32_TO_FLOAT_FMUL_SCALAR sse, 5 +%undef movdqa +%define SPLATD SPLATD_SSE2 
+INT32_TO_FLOAT_FMUL_SCALAR sse2, 3 +%undef SPLATD + + ;------------------------------------------------------------------------------ ; void ff_float_to_int16(int16_t *dst, const float *src, long len); ;------------------------------------------------------------------------------ diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index 6e43280d66..86957b45ff 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -26,52 +26,11 @@ #include "libavutil/x86_cpu.h" #include "libavcodec/fmtconvert.h" -static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtpi2ps (%2,%0), %%xmm0 \n" - "cvtpi2ps 8(%2,%0), %%xmm1 \n" - "cvtpi2ps 16(%2,%0), %%xmm2 \n" - "cvtpi2ps 24(%2,%0), %%xmm3 \n" - "movlhps %%xmm1, %%xmm0 \n" - "movlhps %%xmm3, %%xmm2 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm2 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm2, 16(%1,%0) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} - -static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtdq2ps (%2,%0), %%xmm0 \n" - "cvtdq2ps 16(%2,%0), %%xmm1 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm1 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm1, 16(%1,%0) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} - #if HAVE_YASM +void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len); +void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len); + void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); void 
ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); @@ -204,8 +163,8 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); - if (mm_flags & AV_CPU_FLAG_MMX) { #if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_MMX) { c->float_interleave = float_interleave_mmx; if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) { @@ -219,21 +178,17 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_3dn2; } } -#endif if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) { - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; -#if HAVE_YASM + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; c->float_to_int16 = ff_float_to_int16_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse; c->float_interleave = float_interleave_sse; -#endif } if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) { - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; -#if HAVE_YASM + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; c->float_to_int16 = ff_float_to_int16_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2; -#endif } } +#endif } diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 7e16c15db2..874443a2ef 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -536,6 +536,18 @@ %endif %endmacro +%macro SPLATD_MMX 1 + punpckldq %1, %1 +%endmacro + +%macro SPLATD_SSE 1 + shufps %1, %1, 0 +%endmacro + +%macro SPLATD_SSE2 1 + pshufd %1, %1, 0 +%endmacro + %macro CLIPW 3 ;(dst, min, max) pmaxsw %1, %2 pminsw %1, %3 From aad3429d4e34b74e4eb0b37b17f32804e217cf02 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Mon, 10 Oct 2011 00:43:08 -0400 Subject: [PATCH 17/35] fmtconvert: port float_to_int16_interleave() 2-channel x86 inline asm to yasm --- libavcodec/x86/fmtconvert.asm | 52 +++++++++++++++++++++++++ libavcodec/x86/fmtconvert_mmx.c | 69 
+++++---------------------------- 2 files changed, 61 insertions(+), 60 deletions(-) diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index e3eb5d2286..854954835c 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -112,6 +112,58 @@ FLOAT_TO_INT16 3dnow, 0 %undef cvtps2pi +;------------------------------------------------------------------------------- +; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len); +;------------------------------------------------------------------------------- +%macro FLOAT_TO_INT16_INTERLEAVE2 1 +cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len + lea lenq, [4*r2q] + mov src1q, [src0q+gprsize] + mov src0q, [src0q] + add dstq, lenq + add src0q, lenq + add src1q, lenq + neg lenq +.loop: +%ifidn %1, sse2 + cvtps2dq m0, [src0q+lenq] + cvtps2dq m1, [src1q+lenq] + packssdw m0, m1 + movhlps m1, m0 + punpcklwd m0, m1 + mova [dstq+lenq], m0 +%else + cvtps2pi m0, [src0q+lenq ] + cvtps2pi m1, [src0q+lenq+8] + cvtps2pi m2, [src1q+lenq ] + cvtps2pi m3, [src1q+lenq+8] + packssdw m0, m1 + packssdw m2, m3 + mova m1, m0 + punpcklwd m0, m2 + punpckhwd m1, m2 + mova [dstq+lenq ], m0 + mova [dstq+lenq+8], m1 +%endif + add lenq, 16 + js .loop +%ifnidn %1, sse2 + emms +%endif + REP_RET +%endmacro + +INIT_MMX +%define cvtps2pi pf2id +FLOAT_TO_INT16_INTERLEAVE2 3dnow +%undef cvtps2pi +%define movdqa movaps +FLOAT_TO_INT16_INTERLEAVE2 sse +%undef movdqa +INIT_XMM +FLOAT_TO_INT16_INTERLEAVE2 sse2 + + %macro PSWAPD_SSE 2 pshufw %1, %2, 0x4e %endmacro diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index 86957b45ff..17079d3c82 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -35,13 +35,17 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long 
len); +void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len); +void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len); +void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len); + void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse -#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ +#define FLOAT_TO_INT16_INTERLEAVE(cpu) \ /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ DECLARE_ALIGNED(16, int16_t, tmp)[len];\ @@ -57,71 +61,16 @@ static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, lon if(channels==1)\ ff_float_to_int16_##cpu(dst, src[0], len);\ else if(channels==2){\ - x86_reg reglen = len; \ - const float *src0 = src[0];\ - const float *src1 = src[1];\ - __asm__ volatile(\ - "shl $2, %0 \n"\ - "add %0, %1 \n"\ - "add %0, %2 \n"\ - "add %0, %3 \n"\ - "neg %0 \n"\ - body\ - :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ - );\ + ff_float_to_int16_interleave2_##cpu(dst, src, len);\ }else if(channels==6){\ ff_float_to_int16_interleave6_##cpu(dst, src, len);\ }else\ float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ } -FLOAT_TO_INT16_INTERLEAVE(3dnow, - "1: \n" - "pf2id (%2,%0), %%mm0 \n" - "pf2id 8(%2,%0), %%mm1 \n" - "pf2id (%3,%0), %%mm2 \n" - "pf2id 8(%3,%0), %%mm3 \n" - "packssdw %%mm1, %%mm0 \n" - "packssdw %%mm3, %%mm2 \n" - "movq %%mm0, %%mm1 \n" - "punpcklwd %%mm2, %%mm0 \n" - "punpckhwd %%mm2, %%mm1 \n" - "movq %%mm0, (%1,%0)\n" - "movq %%mm1, 8(%1,%0)\n" - "add $16, %0 \n" - "js 1b \n" - "femms 
\n" -) - -FLOAT_TO_INT16_INTERLEAVE(sse, - "1: \n" - "cvtps2pi (%2,%0), %%mm0 \n" - "cvtps2pi 8(%2,%0), %%mm1 \n" - "cvtps2pi (%3,%0), %%mm2 \n" - "cvtps2pi 8(%3,%0), %%mm3 \n" - "packssdw %%mm1, %%mm0 \n" - "packssdw %%mm3, %%mm2 \n" - "movq %%mm0, %%mm1 \n" - "punpcklwd %%mm2, %%mm0 \n" - "punpckhwd %%mm2, %%mm1 \n" - "movq %%mm0, (%1,%0)\n" - "movq %%mm1, 8(%1,%0)\n" - "add $16, %0 \n" - "js 1b \n" - "emms \n" -) - -FLOAT_TO_INT16_INTERLEAVE(sse2, - "1: \n" - "cvtps2dq (%2,%0), %%xmm0 \n" - "cvtps2dq (%3,%0), %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "punpcklwd %%xmm1, %%xmm0 \n" - "movdqa %%xmm0, (%1,%0) \n" - "add $16, %0 \n" - "js 1b \n" -) +FLOAT_TO_INT16_INTERLEAVE(3dnow) +FLOAT_TO_INT16_INTERLEAVE(sse) +FLOAT_TO_INT16_INTERLEAVE(sse2) static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ if(channels==6) From cb72230dfadb28651e036d717dc12d33b18a6893 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 25 Sep 2011 12:16:34 -0400 Subject: [PATCH 18/35] mp3on4: copy MPADSPContext from first context to all contexts. Fixes segfault when decoding multi-channel MP3onMP4 files. --- libavcodec/mpegaudiodec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 2af05edc87..5d15d25e48 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -1959,6 +1959,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx) s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext)); s->mp3decctx[i]->adu_mode = 1; s->mp3decctx[i]->avctx = avctx; + s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp; } return 0; From f507dd067aec52b251f25e265cdb8b333db33b42 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 25 Sep 2011 12:30:16 -0400 Subject: [PATCH 19/35] mp3on4: allocate temp buffer with av_malloc() instead of on the stack. Avoids allocating unnecessary memory and ensures proper alignment. 
--- libavcodec/mpegaudiodec.c | 45 +++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 5d15d25e48..3bd7b02b9c 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -1894,6 +1894,7 @@ typedef struct MP3On4DecodeContext { int syncword; ///< syncword patch const uint8_t *coff; ///< channels offsets in output buffer MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance + OUT_INT *decoded_buf; ///< output buffer for decoded samples } MP3On4DecodeContext; #include "mpeg4audio.h" @@ -1913,6 +1914,20 @@ static const uint8_t chan_offset[8][5] = { }; +static av_cold int decode_close_mp3on4(AVCodecContext * avctx) +{ + MP3On4DecodeContext *s = avctx->priv_data; + int i; + + for (i = 0; i < s->frames; i++) + av_free(s->mp3decctx[i]); + + av_freep(&s->decoded_buf); + + return 0; +} + + static int decode_init_mp3on4(AVCodecContext * avctx) { MP3On4DecodeContext *s = avctx->priv_data; @@ -1962,19 +1977,18 @@ static int decode_init_mp3on4(AVCodecContext * avctx) s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp; } - return 0; -} - - -static av_cold int decode_close_mp3on4(AVCodecContext * avctx) -{ - MP3On4DecodeContext *s = avctx->priv_data; - int i; - - for (i = 0; i < s->frames; i++) - av_free(s->mp3decctx[i]); + /* Allocate buffer for multi-channel output if needed */ + if (s->frames > 1) { + s->decoded_buf = av_malloc(MPA_FRAME_SIZE * MPA_MAX_CHANNELS * + sizeof(*s->decoded_buf)); + if (!s->decoded_buf) + goto alloc_fail; + } return 0; +alloc_fail: + decode_close_mp3on4(avctx); + return AVERROR(ENOMEM); } @@ -1989,7 +2003,6 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, int fsize, len = buf_size, out_size = 0; uint32_t header; OUT_INT *out_samples = data; - OUT_INT decoded_buf[MPA_FRAME_SIZE * MPA_MAX_CHANNELS]; OUT_INT *outptr, *bp; int fr, j, n; @@ -2002,7 +2015,7 @@ static int 
decode_frame_mp3on4(AVCodecContext * avctx, return -1; // If only one decoder interleave is not needed - outptr = s->frames == 1 ? out_samples : decoded_buf; + outptr = s->frames == 1 ? out_samples : s->decoded_buf; avctx->bit_rate = 0; @@ -2028,13 +2041,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, bp = out_samples + s->coff[fr]; if(m->nb_channels == 1) { for(j = 0; j < n; j++) { - *bp = decoded_buf[j]; + *bp = s->decoded_buf[j]; bp += avctx->channels; } } else { for(j = 0; j < n; j++) { - bp[0] = decoded_buf[j++]; - bp[1] = decoded_buf[j]; + bp[0] = s->decoded_buf[j++]; + bp[1] = s->decoded_buf[j]; bp += avctx->channels; } } From fff0f831e0c8ccf87a6374f4bb349ac668bce14e Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 25 Sep 2011 12:46:54 -0400 Subject: [PATCH 20/35] mp3on4: fix the output channel order --- libavcodec/mpegaudiodec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 3bd7b02b9c..57dadedca1 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -1901,16 +1901,16 @@ typedef struct MP3On4DecodeContext { /* Next 3 arrays are indexed by channel config number (passed via codecdata) */ static const uint8_t mp3Frames[8] = {0,1,1,2,3,3,4,5}; /* number of mp3 decoder instances */ -/* offsets into output buffer, assume output order is FL FR BL BR C LFE */ +/* offsets into output buffer, assume output order is FL FR C LFE BL BR SL SR */ static const uint8_t chan_offset[8][5] = { {0}, {0}, // C {0}, // FLR {2,0}, // C FLR {2,0,3}, // C FLR BS - {4,0,2}, // C FLR BLRS - {4,0,2,5}, // C FLR BLRS LFE - {4,0,2,6,5}, // C FLR BLRS BLR LFE + {2,0,3}, // C FLR BLRS + {2,0,4,3}, // C FLR BLRS LFE + {2,0,6,4,3}, // C FLR BLRS BLR LFE }; From 1183d6cd98da7d1e9f751a68d288b200240ed335 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 25 Sep 2011 12:52:11 -0400 Subject: [PATCH 21/35] mp3on4: set channel layout --- libavcodec/mpegaudiodec.c | 
12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 57dadedca1..f2728585f7 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -1913,6 +1913,17 @@ static const uint8_t chan_offset[8][5] = { {2,0,6,4,3}, // C FLR BLRS BLR LFE }; +/* mp3on4 channel layouts */ +static const int16_t chan_layout[8] = { + 0, + AV_CH_LAYOUT_MONO, + AV_CH_LAYOUT_STEREO, + AV_CH_LAYOUT_SURROUND, + AV_CH_LAYOUT_4POINT0, + AV_CH_LAYOUT_5POINT0, + AV_CH_LAYOUT_5POINT1, + AV_CH_LAYOUT_7POINT1 +}; static av_cold int decode_close_mp3on4(AVCodecContext * avctx) { @@ -1947,6 +1958,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx) s->frames = mp3Frames[cfg.chan_config]; s->coff = chan_offset[cfg.chan_config]; avctx->channels = ff_mpeg4audio_channels[cfg.chan_config]; + avctx->channel_layout = chan_layout[cfg.chan_config]; if (cfg.sample_rate < 16000) s->syncword = 0xffe00000; From 53c8443ad2376a50c76e5d7c69435bd01b0abc42 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 25 Sep 2011 13:04:39 -0400 Subject: [PATCH 22/35] mp3on4: ensure that the frame channel count does not exceed the codec channel count. This also allows for checking output data size based on the actual number of channels instead of the maximum number of channels. 
--- libavcodec/mpegaudiodec.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index f2728585f7..c3c6ee3805 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -2016,10 +2016,12 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, uint32_t header; OUT_INT *out_samples = data; OUT_INT *outptr, *bp; - int fr, j, n; + int fr, j, n, ch; - if(*data_size < MPA_FRAME_SIZE * MPA_MAX_CHANNELS * s->frames * sizeof(OUT_INT)) - return -1; + if (*data_size < MPA_FRAME_SIZE * avctx->channels * sizeof(OUT_INT)) { + av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n"); + return AVERROR(EINVAL); + } *data_size = 0; // Discard too short frames @@ -2031,6 +2033,7 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, avctx->bit_rate = 0; + ch = 0; for (fr = 0; fr < s->frames; fr++) { fsize = AV_RB16(buf) >> 4; fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE); @@ -2043,6 +2046,14 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, break; avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header); + + if (ch + m->nb_channels > avctx->channels) { + av_log(avctx, AV_LOG_ERROR, "frame channel count exceeds codec " + "channel count\n"); + return AVERROR_INVALIDDATA; + } + ch += m->nb_channels; + out_size += mp_decode_frame(m, outptr, buf, fsize); buf += fsize; len -= fsize; From 180bf988bc524f4775dd4765f07816df324e808b Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 25 Sep 2011 13:39:04 -0400 Subject: [PATCH 23/35] mp3on4: create a separate flush function for MP3onMP4. The correct decoder private context needs to be used. This fixes mp3on4 playback and seeking in avplay. 
--- libavcodec/mpegaudiodec.c | 15 ++++++++++++++- libavcodec/mpegaudiodec_float.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index c3c6ee3805..040b1090d6 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -2004,6 +2004,19 @@ alloc_fail: } +static void flush_mp3on4(AVCodecContext *avctx) +{ + int i; + MP3On4DecodeContext *s = avctx->priv_data; + + for (i = 0; i < s->frames; i++) { + MPADecodeContext *m = s->mp3decctx[i]; + memset(m->synth_buf, 0, sizeof(m->synth_buf)); + m->last_buf_size = 0; + } +} + + static int decode_frame_mp3on4(AVCodecContext * avctx, void *data, int *data_size, AVPacket *avpkt) @@ -2148,7 +2161,7 @@ AVCodec ff_mp3on4_decoder = { .init = decode_init_mp3on4, .close = decode_close_mp3on4, .decode = decode_frame_mp3on4, - .flush = flush, + .flush = flush_mp3on4, .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"), }; #endif diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c index 929d72738b..7f512500b3 100644 --- a/libavcodec/mpegaudiodec_float.c +++ b/libavcodec/mpegaudiodec_float.c @@ -83,7 +83,7 @@ AVCodec ff_mp3on4float_decoder = { .init = decode_init_mp3on4, .close = decode_close_mp3on4, .decode = decode_frame_mp3on4, - .flush = flush, + .flush = flush_mp3on4, .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"), }; #endif From 95891804bf300b266aa5328f1c338c046720e658 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Sun, 25 Sep 2011 14:32:42 -0400 Subject: [PATCH 24/35] mp3on4: check for allocation failures in decode_init_mp3on4() --- libavcodec/mpegaudiodec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 040b1090d6..2b357b5412 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -1972,6 +1972,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx) */ // Allocate zeroed memory for the first decoder context s->mp3decctx[0] = 
av_mallocz(sizeof(MPADecodeContext)); + if (!s->mp3decctx[0]) + goto alloc_fail; // Put decoder context in place to make init_decode() happy avctx->priv_data = s->mp3decctx[0]; decode_init(avctx); @@ -1984,6 +1986,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx) */ for (i = 1; i < s->frames; i++) { s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext)); + if (!s->mp3decctx[i]) + goto alloc_fail; s->mp3decctx[i]->adu_mode = 1; s->mp3decctx[i]->avctx = avctx; s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp; From 94395fbf8c85d6139115d8b20cce19476ed87806 Mon Sep 17 00:00:00 2001 From: Raivo Hool Date: Fri, 21 Oct 2011 16:51:33 +0300 Subject: [PATCH 25/35] mov: parse the gnre atom Signed-off-by: Anton Khirnov --- libavformat/mov.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/libavformat/mov.c b/libavformat/mov.c index 6baddebb82..3c551c63aa 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -35,6 +35,7 @@ #include "riff.h" #include "isom.h" #include "libavcodec/get_bits.h" +#include "id3v1.h" #if CONFIG_ZLIB #include @@ -126,6 +127,23 @@ static int mov_metadata_stik(MOVContext *c, AVIOContext *pb, return 0; } +static int mov_metadata_gnre(MOVContext *c, AVIOContext *pb, + unsigned len, const char *key) +{ + short genre; + char buf[20]; + + avio_r8(pb); // unknown + + genre = avio_r8(pb); + if (genre < 1 || genre > ID3v1_GENRE_MAX) + return 0; + snprintf(buf, sizeof(buf), "%s", ff_id3v1_genre_str[genre-1]); + av_dict_set(&c->fc->metadata, key, buf, 0); + + return 0; +} + static const uint32_t mac_to_unicode[128] = { 0x00C4,0x00C5,0x00C7,0x00C9,0x00D1,0x00D6,0x00DC,0x00E1, 0x00E0,0x00E2,0x00E4,0x00E3,0x00E5,0x00E7,0x00E9,0x00E8, @@ -187,6 +205,8 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom) case MKTAG(0xa9,'a','l','b'): key = "album"; break; case MKTAG(0xa9,'d','a','y'): key = "date"; break; case MKTAG(0xa9,'g','e','n'): key = "genre"; break; + case MKTAG( 'g','n','r','e'): key = "genre"; + 
parse = mov_metadata_gnre; break; case MKTAG(0xa9,'t','o','o'): case MKTAG(0xa9,'s','w','r'): key = "encoder"; break; case MKTAG(0xa9,'e','n','c'): key = "encoder"; break; From 80951f5cf6cd34b317857c710d8cce09b0b73c4f Mon Sep 17 00:00:00 2001 From: Raivo Hool Date: Fri, 21 Oct 2011 16:04:13 +0300 Subject: [PATCH 26/35] mov: rename function _int8 to remove ambiguity, some indentation cosmetics Signed-off-by: Anton Khirnov --- libavformat/mov.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/libavformat/mov.c b/libavformat/mov.c index 3c551c63aa..84bd4ed9c8 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -100,20 +100,20 @@ static int mov_metadata_track_or_disc_number(MOVContext *c, AVIOContext *pb, return 0; } -static int mov_metadata_int8(MOVContext *c, AVIOContext *pb, - unsigned len, const char *key) +static int mov_metadata_int8_bypass_padding(MOVContext *c, AVIOContext *pb, + unsigned len, const char *key) { - char buf[16]; + char buf[16]; - /* bypass padding bytes */ - avio_r8(pb); - avio_r8(pb); - avio_r8(pb); + /* bypass padding bytes */ + avio_r8(pb); + avio_r8(pb); + avio_r8(pb); - snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); - av_dict_set(&c->fc->metadata, key, buf, 0); + snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); + av_dict_set(&c->fc->metadata, key, buf, 0); - return 0; + return 0; } static int mov_metadata_stik(MOVContext *c, AVIOContext *pb, @@ -220,9 +220,9 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom) case MKTAG( 'd','i','s','k'): key = "disc"; parse = mov_metadata_track_or_disc_number; break; case MKTAG( 't','v','e','s'): key = "episode_sort"; - parse = mov_metadata_int8; break; + parse = mov_metadata_int8_bypass_padding; break; case MKTAG( 't','v','s','n'): key = "season_number"; - parse = mov_metadata_int8; break; + parse = mov_metadata_int8_bypass_padding; break; case MKTAG( 's','t','i','k'): key = "media_type"; parse = mov_metadata_stik; break; } 
From 5da35d1cb37fbaf0c6233955ec1934216d75a3bc Mon Sep 17 00:00:00 2001 From: Raivo Hool Date: Fri, 21 Oct 2011 16:04:14 +0300 Subject: [PATCH 27/35] mov: rename function _stik, some indentation cosmetics Signed-off-by: Anton Khirnov --- libavformat/mov.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libavformat/mov.c b/libavformat/mov.c index 84bd4ed9c8..3a00b4679f 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -116,15 +116,15 @@ static int mov_metadata_int8_bypass_padding(MOVContext *c, AVIOContext *pb, return 0; } -static int mov_metadata_stik(MOVContext *c, AVIOContext *pb, - unsigned len, const char *key) +static int mov_metadata_int8_no_padding(MOVContext *c, AVIOContext *pb, + unsigned len, const char *key) { - char buf[16]; + char buf[16]; - snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); - av_dict_set(&c->fc->metadata, key, buf, 0); + snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); + av_dict_set(&c->fc->metadata, key, buf, 0); - return 0; + return 0; } static int mov_metadata_gnre(MOVContext *c, AVIOContext *pb, @@ -224,7 +224,7 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom) case MKTAG( 't','v','s','n'): key = "season_number"; parse = mov_metadata_int8_bypass_padding; break; case MKTAG( 's','t','i','k'): key = "media_type"; - parse = mov_metadata_stik; break; + parse = mov_metadata_int8_no_padding; break; } if (c->itunes_metadata && atom.size > 8) { From b06df7075590b7954900a9ef5e2dd0e7e832544c Mon Sep 17 00:00:00 2001 From: Raivo Hool Date: Fri, 21 Oct 2011 16:04:15 +0300 Subject: [PATCH 28/35] mov: add support for hdvd and pgapmetadata atoms Signed-off-by: Anton Khirnov --- libavformat/mov.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libavformat/mov.c b/libavformat/mov.c index 3a00b4679f..1747bd41a2 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -225,6 +225,10 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom) parse = 
mov_metadata_int8_bypass_padding; break; case MKTAG( 's','t','i','k'): key = "media_type"; parse = mov_metadata_int8_no_padding; break; + case MKTAG( 'h','d','v','d'): key = "hd_video"; + parse = mov_metadata_int8_no_padding; break; + case MKTAG( 'p','g','a','p'): key = "gapless_playback"; + parse = mov_metadata_int8_no_padding; break; } if (c->itunes_metadata && atom.size > 8) { From 5dd35b43f1cd3dddaddaae8e2f267117b5fa2d54 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Fri, 21 Oct 2011 11:47:39 +0200 Subject: [PATCH 29/35] Move timefilter code from lavf to lavd. It's only used in the JACK device. Fixes linking shared lavd with JACK enabled. --- libavdevice/Makefile | 4 +++- libavdevice/jack_audio.c | 2 +- {libavformat => libavdevice}/timefilter.c | 2 +- {libavformat => libavdevice}/timefilter.h | 6 +++--- libavformat/Makefile | 5 +---- 5 files changed, 9 insertions(+), 10 deletions(-) rename {libavformat => libavdevice}/timefilter.c (99%) rename {libavformat => libavdevice}/timefilter.h (97%) diff --git a/libavdevice/Makefile b/libavdevice/Makefile index d8a5945549..1f2a6efceb 100644 --- a/libavdevice/Makefile +++ b/libavdevice/Makefile @@ -13,7 +13,7 @@ OBJS-$(CONFIG_ALSA_OUTDEV) += alsa-audio-common.o \ OBJS-$(CONFIG_BKTR_INDEV) += bktr.o OBJS-$(CONFIG_DV1394_INDEV) += dv1394.o OBJS-$(CONFIG_FBDEV_INDEV) += fbdev.o -OBJS-$(CONFIG_JACK_INDEV) += jack_audio.o +OBJS-$(CONFIG_JACK_INDEV) += jack_audio.o timefilter.o OBJS-$(CONFIG_OSS_INDEV) += oss_audio.o OBJS-$(CONFIG_OSS_OUTDEV) += oss_audio.o OBJS-$(CONFIG_SNDIO_INDEV) += sndio_common.o sndio_dec.o @@ -30,4 +30,6 @@ OBJS-$(CONFIG_LIBDC1394_INDEV) += libdc1394.o SKIPHEADERS-$(HAVE_ALSA_ASOUNDLIB_H) += alsa-audio.h SKIPHEADERS-$(HAVE_SNDIO_H) += sndio_common.h +TESTPROGS = timefilter + include $(SRC_PATH)/subdir.mak diff --git a/libavdevice/jack_audio.c b/libavdevice/jack_audio.c index 4907e82395..f75c176be9 100644 --- a/libavdevice/jack_audio.c +++ b/libavdevice/jack_audio.c @@ -29,7 +29,7 @@ #include 
"libavutil/opt.h" #include "libavcodec/avcodec.h" #include "libavformat/avformat.h" -#include "libavformat/timefilter.h" +#include "timefilter.h" /** * Size of the internal FIFO buffers as a number of audio packets diff --git a/libavformat/timefilter.c b/libavdevice/timefilter.c similarity index 99% rename from libavformat/timefilter.c rename to libavdevice/timefilter.c index 4860a4ff70..332d33b5e8 100644 --- a/libavformat/timefilter.c +++ b/libavdevice/timefilter.c @@ -24,8 +24,8 @@ #include "config.h" -#include "avformat.h" #include "timefilter.h" +#include "libavutil/mem.h" struct TimeFilter { /// Delay Locked Loop data. These variables refer to mathematical diff --git a/libavformat/timefilter.h b/libavdevice/timefilter.h similarity index 97% rename from libavformat/timefilter.h rename to libavdevice/timefilter.h index aa7db533b4..c98fd03bba 100644 --- a/libavformat/timefilter.h +++ b/libavdevice/timefilter.h @@ -22,8 +22,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef AVFORMAT_TIMEFILTER_H -#define AVFORMAT_TIMEFILTER_H +#ifndef AVDEVICE_TIMEFILTER_H +#define AVDEVICE_TIMEFILTER_H /** * Opaque type representing a time filter state @@ -94,4 +94,4 @@ void ff_timefilter_reset(TimeFilter *); */ void ff_timefilter_destroy(TimeFilter *); -#endif /* AVFORMAT_TIMEFILTER_H */ +#endif /* AVDEVICE_TIMEFILTER_H */ diff --git a/libavformat/Makefile b/libavformat/Makefile index 0a30c6ec1a..6973b15995 100644 --- a/libavformat/Makefile +++ b/libavformat/Makefile @@ -334,11 +334,8 @@ OBJS-$(CONFIG_RTP_PROTOCOL) += rtpproto.o OBJS-$(CONFIG_TCP_PROTOCOL) += tcp.o OBJS-$(CONFIG_UDP_PROTOCOL) += udp.o -# libavdevice dependencies -OBJS-$(CONFIG_JACK_INDEV) += timefilter.o - EXAMPLES = metadata output -TESTPROGS = seek timefilter +TESTPROGS = seek TOOLS = pktdumper probetest include $(SRC_PATH)/subdir.mak From 41ac093f7e315e3af17612f580c387b3688f4f43 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 21 Oct 2011 20:36:11 +0100 
Subject: [PATCH 30/35] swscale: fix signed shift overflows in ff_yuv2rgb_c_init_tables() Signed-off-by: Mans Rullgard --- libswscale/yuv2rgb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c index cad09338d3..39c8b9c6fb 100644 --- a/libswscale/yuv2rgb.c +++ b/libswscale/yuv2rgb.c @@ -788,8 +788,8 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int y_table32 = c->yuvTable; yb = -(384<<16) - oy; for (i = 0; i < 1024; i++) { - uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16); - y_table32[i ] = (yval << rbase) + (needAlpha ? 0 : (255 << abase)); + unsigned yval = av_clip_uint8((yb + 0x8000) >> 16); + y_table32[i ] = (yval << rbase) + (needAlpha ? 0 : (255u << abase)); y_table32[i+1024] = yval << gbase; y_table32[i+2048] = yval << bbase; yb += cy; From 7eeaa6796b647f8ef46c234e4f80a7dd5591ee71 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Fri, 21 Oct 2011 12:07:42 -0400 Subject: [PATCH 31/35] libspeexdec: decode one frame at a time. This allows for knowing the output size before decoding even when there is no header (e.g. FLV). Otherwise we would have to do a preliminary full frame decode to determine the number of frames-per-packet. 
--- libavcodec/libspeexdec.c | 54 ++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c index cda987ca6a..fc90308065 100644 --- a/libavcodec/libspeexdec.c +++ b/libavcodec/libspeexdec.c @@ -99,32 +99,42 @@ static int libspeex_decode_frame(AVCodecContext *avctx, uint8_t *buf = avpkt->data; int buf_size = avpkt->size; LibSpeexContext *s = avctx->priv_data; - int16_t *output = data, *end; - int i, num_samples; - - num_samples = s->frame_size * avctx->channels; - end = output + *data_size / sizeof(*output); + int16_t *output = data; + int out_size, ret, consumed = 0; + + /* check output buffer size */ + out_size = s->frame_size * avctx->channels * + av_get_bytes_per_sample(avctx->sample_fmt); + if (*data_size < out_size) { + av_log(avctx, AV_LOG_ERROR, "Output buffer is too small\n"); + return AVERROR(EINVAL); + } - speex_bits_read_from(&s->bits, buf, buf_size); + /* if there is not enough data left for the smallest possible frame, + reset the libspeex buffer using the current packet, otherwise ignore + the current packet and keep decoding frames from the libspeex buffer. 
*/ + if (speex_bits_remaining(&s->bits) < 43) { + /* check for flush packet */ + if (!buf || !buf_size) { + *data_size = 0; + return buf_size; + } + /* set new buffer */ + speex_bits_read_from(&s->bits, buf, buf_size); + consumed = buf_size; + } - for (i = 0; speex_bits_remaining(&s->bits) && output + num_samples < end; i++) { - int ret = speex_decode_int(s->dec_state, &s->bits, output); + /* decode a single frame */ + ret = speex_decode_int(s->dec_state, &s->bits, output); if (ret <= -2) { av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n"); return -1; - } else if (ret == -1) - // end of stream - break; - + } if (avctx->channels == 2) speex_decode_stereo_int(output, s->frame_size, &s->stereo); - output += num_samples; - } - - avctx->frame_size = s->frame_size * i; - *data_size = avctx->channels * avctx->frame_size * sizeof(*output); - return buf_size; + *data_size = out_size; + return consumed; } static av_cold int libspeex_decode_close(AVCodecContext *avctx) @@ -138,6 +148,12 @@ static av_cold int libspeex_decode_close(AVCodecContext *avctx) return 0; } +static av_cold void libspeex_decode_flush(AVCodecContext *avctx) +{ + LibSpeexContext *s = avctx->priv_data; + speex_bits_reset(&s->bits); +} + AVCodec ff_libspeex_decoder = { .name = "libspeex", .type = AVMEDIA_TYPE_AUDIO, @@ -146,5 +162,7 @@ AVCodec ff_libspeex_decoder = { .init = libspeex_decode_init, .close = libspeex_decode_close, .decode = libspeex_decode_frame, + .flush = libspeex_decode_flush, + .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY, .long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"), }; From 14bc60dbaeb10cb95bd47902067984de88e0315e Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Fri, 21 Oct 2011 12:10:35 -0400 Subject: [PATCH 32/35] libspeexdec: cosmetics: reindent --- libavcodec/libspeexdec.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c index fc90308065..69742297f0 100644 --- 
a/libavcodec/libspeexdec.c +++ b/libavcodec/libspeexdec.c @@ -125,13 +125,13 @@ static int libspeex_decode_frame(AVCodecContext *avctx, } /* decode a single frame */ - ret = speex_decode_int(s->dec_state, &s->bits, output); - if (ret <= -2) { - av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n"); - return -1; - } - if (avctx->channels == 2) - speex_decode_stereo_int(output, s->frame_size, &s->stereo); + ret = speex_decode_int(s->dec_state, &s->bits, output); + if (ret <= -2) { + av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n"); + return -1; + } + if (avctx->channels == 2) + speex_decode_stereo_int(output, s->frame_size, &s->stereo); *data_size = out_size; return consumed; From a470fe80ba21513c29e319d968f87f1379a97d16 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Fri, 21 Oct 2011 12:13:04 -0400 Subject: [PATCH 33/35] libspeexdec: return meaningful error codes --- libavcodec/libspeexdec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c index 69742297f0..f66331ea93 100644 --- a/libavcodec/libspeexdec.c +++ b/libavcodec/libspeexdec.c @@ -60,14 +60,14 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx) mode = speex_lib_get_mode(s->header->mode); if (!mode) { av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", s->header->mode); - return -1; + return AVERROR_INVALIDDATA; } } else av_log(avctx, AV_LOG_INFO, "Missing Speex header, assuming defaults.\n"); if (avctx->channels > 2) { av_log(avctx, AV_LOG_ERROR, "Only stereo and mono are supported.\n"); - return -1; + return AVERROR(EINVAL); } speex_bits_init(&s->bits); @@ -128,7 +128,7 @@ static int libspeex_decode_frame(AVCodecContext *avctx, ret = speex_decode_int(s->dec_state, &s->bits, output); if (ret <= -2) { av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n"); - return -1; + return AVERROR_INVALIDDATA; } if (avctx->channels == 2) speex_decode_stereo_int(output, s->frame_size, &s->stereo); From 
b19e0c2b4e74349d3b362e48c57eb233f1880b28 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Fri, 21 Oct 2011 12:13:28 -0400 Subject: [PATCH 34/35] libspeexdec: include system headers before local headers --- libavcodec/libspeexdec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c index f66331ea93..8bbae6c4f3 100644 --- a/libavcodec/libspeexdec.c +++ b/libavcodec/libspeexdec.c @@ -18,11 +18,11 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "avcodec.h" #include #include #include #include +#include "avcodec.h" typedef struct { SpeexBits bits; From f4b51d061f0f34e36be876b562b8abe47f4b9c1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= Date: Wed, 19 Oct 2011 18:41:02 +0200 Subject: [PATCH 35/35] flvdec: Do not call parse_keyframes_index with a NULL stream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavformat/flvdec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libavformat/flvdec.c b/libavformat/flvdec.c index 395c8f8a57..1459850f4a 100644 --- a/libavformat/flvdec.c +++ b/libavformat/flvdec.c @@ -239,8 +239,9 @@ static int amf_parse_object(AVFormatContext *s, AVStream *astream, AVStream *vst case AMF_DATA_TYPE_OBJECT: { unsigned int keylen; - if (key && !strcmp(KEYFRAMES_TAG, key) && depth == 1) - if (parse_keyframes_index(s, ioc, vstream, max_pos) < 0) + if ((vstream || astream) && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1) + if (parse_keyframes_index(s, ioc, vstream ? vstream : astream, + max_pos) < 0) return -1; while(avio_tell(ioc) < max_pos - 2 && (keylen = avio_rb16(ioc))) {