From c2d23309ef87c3f1a515860425a5a898aab9b760 Mon Sep 17 00:00:00 2001 From: Kostya Shishkov Date: Sat, 16 Jul 2011 12:02:55 +0200 Subject: [PATCH 1/4] add Flash Screen Video 2 decoder Signed-off-by: Ronald S. Bultje --- Changelog | 1 + configure | 1 + doc/general.texi | 1 + libavcodec/Makefile | 1 + libavcodec/allcodecs.c | 1 + libavcodec/flashsv.c | 307 +++++++++++++++++++++++++++++++++++++++-- libavcodec/version.h | 2 +- 7 files changed, 301 insertions(+), 13 deletions(-) diff --git a/Changelog b/Changelog index 782d8b3f04..7b8be5b302 100644 --- a/Changelog +++ b/Changelog @@ -4,6 +4,7 @@ releases are sorted from youngest to oldest. version : - BWF muxer +- Flash Screen Video 2 decoder version 0.7: diff --git a/configure b/configure index 3953cfc44b..a83d3396e1 100755 --- a/configure +++ b/configure @@ -1270,6 +1270,7 @@ flac_decoder_select="golomb" flac_encoder_select="golomb lpc" flashsv_decoder_select="zlib" flashsv_encoder_select="zlib" +flashsv2_decoder_select="zlib" flv_decoder_select="h263_decoder" flv_encoder_select="h263_encoder" fraps_decoder_select="huffman" diff --git a/doc/general.texi b/doc/general.texi index fe314481a3..c9919390d6 100644 --- a/doc/general.texi +++ b/doc/general.texi @@ -401,6 +401,7 @@ following image formats are supported: @tab experimental lossless codec (fourcc: FFV1) @item Flash Screen Video v1 @tab X @tab X @tab fourcc: FSV1 +@item Flash Screen Video v2 @tab @tab X @item Flash Video (FLV) @tab X @tab X @tab Sorenson H.263 used in Flash @item Fraps @tab @tab X diff --git a/libavcodec/Makefile b/libavcodec/Makefile index f23ade2734..99ecbbf567 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -146,6 +146,7 @@ OBJS-$(CONFIG_FLAC_DECODER) += flacdec.o flacdata.o flac.o OBJS-$(CONFIG_FLAC_ENCODER) += flacenc.o flacdata.o flac.o OBJS-$(CONFIG_FLASHSV_DECODER) += flashsv.o OBJS-$(CONFIG_FLASHSV_ENCODER) += flashsvenc.o +OBJS-$(CONFIG_FLASHSV2_DECODER) += flashsv.o OBJS-$(CONFIG_FLIC_DECODER) += flicvideo.o OBJS-$(CONFIG_FOURXM_DECODER) += 4xm.o OBJS-$(CONFIG_FRAPS_DECODER) += fraps.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 71b6094089..dcef0d6d94 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -106,6 +106,7 @@ void avcodec_register_all(void) REGISTER_ENCDEC (FFV1, ffv1); REGISTER_ENCDEC (FFVHUFF, ffvhuff); REGISTER_ENCDEC (FLASHSV, flashsv); + REGISTER_DECODER (FLASHSV2, flashsv2); REGISTER_DECODER (FLIC, flic); REGISTER_ENCDEC (FLV, flv); REGISTER_DECODER (FOURXM, fourxm); diff --git a/libavcodec/flashsv.c b/libavcodec/flashsv.c index 51831eb801..57d33c06da 100644 --- a/libavcodec/flashsv.c +++ b/libavcodec/flashsv.c @@ -25,6 +25,8 @@ * Flash Screen Video decoder * @author Alex Beregszaszi * @author Benjamin Larsson + * @author Daniel Verkamp + * @author Konstantin Shishkov * * A description of the bitstream format for Flash Screen Video version 1/2 * is part of the SWF File Format Specification (version 10), which can be @@ -35,9 +37,17 @@ #include #include +#include "libavutil/intreadwrite.h" #include "avcodec.h" +#include "bytestream.h" #include "get_bits.h" +typedef struct BlockInfo { + uint8_t *pos; + int size; + int unp_size; +} BlockInfo; + typedef struct FlashSVContext { AVCodecContext *avctx; AVFrame frame; @@ -46,9 +56,50 @@ typedef struct FlashSVContext { uint8_t *tmpblock; int block_size; z_stream zstream; + int ver; + const uint32_t *pal; + int is_keyframe; + uint8_t *keyframedata; + uint8_t *keyframe; + BlockInfo *blocks; + uint8_t *deflate_block; + int deflate_block_size; + int color_depth; + int zlibprime_curr, zlibprime_prev; + int diff_start, diff_height; } FlashSVContext; +static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy, + int h, int w, int stride, const uint32_t *pal) +{ + int x, y; + const uint8_t *orig_src = sptr; + + for (y = dx+h; y > dx; y--) { + uint8_t *dst = dptr + (y * stride) + dy * 3; + for (x = 0; x < w; x++) { + if (*sptr & 0x80) { + /* 15-bit color */ + unsigned c = AV_RB16(sptr) & ~0x8000; + unsigned b = c & 0x1F; + unsigned g = (c >> 5) & 0x1F; + unsigned r = c >> 10; + /* 000aaabb -> aaabbaaa */ + *dst++ = (b << 3) | (b >> 2); + *dst++ = (g << 3) | (g >> 2); + *dst++ = (r << 3) | (r >> 2); + sptr += 2; + } else { + /* palette index */ + uint32_t c = pal[*sptr++]; + bytestream_put_le24(&dst, c); + } + } + } + return sptr - orig_src; +} + static av_cold int flashsv_decode_init(AVCodecContext *avctx) { FlashSVContext *s = avctx->priv_data; @@ -70,9 +121,42 @@ static av_cold int flashsv_decode_init(AVCodecContext *avctx) } +static void flashsv2_prime(FlashSVContext *s, uint8_t *src, + int size, int unp_size) +{ + z_stream zs; + + zs.zalloc = NULL; + zs.zfree = NULL; + zs.opaque = NULL; + + s->zstream.next_in = src; + s->zstream.avail_in = size; + s->zstream.next_out = s->tmpblock; + s->zstream.avail_out = s->block_size * 3; + inflate(&s->zstream, Z_SYNC_FLUSH); + + deflateInit(&zs, 0); + zs.next_in = s->tmpblock; + zs.avail_in = s->block_size * 3 - s->zstream.avail_out; + zs.next_out = s->deflate_block; + zs.avail_out = s->deflate_block_size; + deflate(&zs, Z_SYNC_FLUSH); + deflateEnd(&zs); + + inflateReset(&s->zstream); + + s->zstream.next_in = s->deflate_block; + s->zstream.avail_in = s->deflate_block_size - zs.avail_out; + s->zstream.next_out = s->tmpblock; + s->zstream.avail_out = s->block_size * 3; + inflate(&s->zstream, Z_SYNC_FLUSH); +} + static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt, GetBitContext *gb, int block_size, - int width, int height, int x_pos, int y_pos) + int width, int height, int x_pos, int y_pos, + int blk_idx) { struct FlashSVContext *s = avctx->priv_data; uint8_t *line = s->tmpblock; @@ -81,6 +165,10 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt, if (ret != Z_OK) { //return -1; } + if (s->zlibprime_curr || s->zlibprime_prev) { + flashsv2_prime(s, s->blocks[blk_idx].pos, s->blocks[blk_idx].size, + s->blocks[blk_idx].unp_size); + } s->zstream.next_in = avpkt->data + get_bits_count(gb) / 8; s->zstream.avail_in = block_size; s->zstream.next_out = s->tmpblock; @@ -95,19 +183,48 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt, if (ret != Z_OK && ret != Z_STREAM_END) { //return -1; } - /* Flash Screen Video stores the image upside down, so copy - * lines to destination in reverse order. */ - for (k = 1; k <= height; k++) { - memcpy(s->frame.data[0] + x_pos * 3 + - (s->image_height - y_pos - k) * s->frame.linesize[0], - line, width * 3); - /* advance source pointer to next line */ - line += width * 3; + + if (s->is_keyframe) { + s->blocks[blk_idx].pos = s->keyframedata + (get_bits_count(gb) / 8); + s->blocks[blk_idx].size = block_size; + s->blocks[blk_idx].unp_size = s->block_size * 3 - s->zstream.avail_out; + } + if (!s->color_depth) { + /* Flash Screen Video stores the image upside down, so copy + * lines to destination in reverse order. */ + for (k = 1; k <= s->diff_height; k++) { + memcpy(s->frame.data[0] + x_pos * 3 + + (s->image_height - y_pos - s->diff_start - k) * s->frame.linesize[0], + line, width * 3); + /* advance source pointer to next line */ + line += width * 3; + } + } else { + /* hybrid 15-bit/palette mode */ + decode_hybrid(s->tmpblock, s->frame.data[0], + s->image_height - (y_pos + 1 + s->diff_start + s->diff_height), + x_pos, s->diff_height, width, + s->frame.linesize[0], s->pal); } skip_bits_long(gb, 8 * block_size); /* skip the consumed bits */ return 0; } +static int calc_deflate_block_size(int tmpblock_size) +{ + z_stream zstream; + int size; + + zstream.zalloc = Z_NULL; + zstream.zfree = Z_NULL; + zstream.opaque = Z_NULL; + if (deflateInit(&zstream, 0) != Z_OK) + return -1; + size = deflateBound(&zstream, tmpblock_size); + deflateEnd(&zstream); + + return size; +} static int flashsv_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) @@ -131,6 +248,18 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, s->block_height = 16 * (get_bits(&gb, 4) + 1); s->image_height = get_bits(&gb, 12); + if (s->ver == 2) { + skip_bits(&gb, 6); + if (get_bits1(&gb)) { + av_log_missing_feature(avctx, "iframe", 1); + return AVERROR_PATCHWELCOME; + } + if (get_bits1(&gb)) { + av_log_missing_feature(avctx, "custom palette", 1); + return AVERROR_PATCHWELCOME; + } + } + /* calculate number of blocks and size of border (partial) blocks */ h_blocks = s->image_width / s->block_width; h_part = s->image_width % s->block_width; @@ -140,11 +269,25 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, /* the block size could change between frames, make sure the buffer * is large enough, if not, get a larger one */ if (s->block_size < s->block_width * s->block_height) { - av_free(s->tmpblock); - if ((s->tmpblock = av_malloc(3 * s->block_width * s->block_height)) == NULL) { + int tmpblock_size = 3 * s->block_width * s->block_height; + + s->tmpblock = av_realloc(s->tmpblock, tmpblock_size); + if (!s->tmpblock) { av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n"); return AVERROR(ENOMEM); } + if (s->ver == 2) { + s->deflate_block_size = calc_deflate_block_size(tmpblock_size); + if (s->deflate_block_size <= 0) { + av_log(avctx, AV_LOG_ERROR, "Can't determine deflate buffer size.\n"); + return -1; + } + s->deflate_block = av_realloc(s->deflate_block, s->deflate_block_size); + if (!s->deflate_block) { + av_log(avctx, AV_LOG_ERROR, "Can't allocate deflate buffer.\n"); + return AVERROR(ENOMEM); + } + } } s->block_size = s->block_width * s->block_height; @@ -163,6 +306,16 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, return AVERROR_INVALIDDATA; } + /* we care for keyframes only in Screen Video v2 */ + s->is_keyframe = (avpkt->flags & AV_PKT_FLAG_KEY) && (s->ver == 2); + if (s->is_keyframe) { + s->keyframedata = av_realloc(s->keyframedata, avpkt->size); + memcpy(s->keyframedata, avpkt->data, avpkt->size); + s->blocks = av_realloc(s->blocks, + (v_blocks + !!v_part) * (h_blocks + !!h_part) + * sizeof(s->blocks[0])); + } + av_dlog(avctx, "image: %dx%d block: %dx%d num: %dx%d part: %dx%d\n", s->image_width, s->image_height, s->block_width, s->block_height, h_blocks, v_blocks, h_part, v_part); @@ -186,25 +339,90 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, for (i = 0; i < h_blocks + (h_part ? 1 : 0); i++) { int x_pos = i * s->block_width; // horizontal position in frame int cur_blk_width = (i < h_blocks) ? s->block_width : h_part; + int has_diff = 0; /* get the size of the compressed zlib chunk */ int size = get_bits(&gb, 16); + + s->color_depth = 0; + s->zlibprime_curr = 0; + s->zlibprime_prev = 0; + s->diff_start = 0; + s->diff_height = cur_blk_height; + if (8 * size > get_bits_left(&gb)) { avctx->release_buffer(avctx, &s->frame); s->frame.data[0] = NULL; return AVERROR_INVALIDDATA; } + if (s->ver == 2 && size) { + skip_bits(&gb, 3); + s->color_depth = get_bits(&gb, 2); + has_diff = get_bits1(&gb); + s->zlibprime_curr = get_bits1(&gb); + s->zlibprime_prev = get_bits1(&gb); + + if (s->color_depth != 0 && s->color_depth != 2) { + av_log(avctx, AV_LOG_ERROR, + "%dx%d invalid color depth %d\n", i, j, s->color_depth); + return -1; + } + + if (has_diff) { + s->diff_start = get_bits(&gb, 8); + s->diff_height = get_bits(&gb, 8); + av_log(avctx, AV_LOG_DEBUG, + "%dx%d diff start %d height %d\n", + i, j, s->diff_start, s->diff_height); + size -= 2; + } + + if (s->zlibprime_prev) + av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_prev\n", i, j); + + if (s->zlibprime_curr) { + int col = get_bits(&gb, 8); + int row = get_bits(&gb, 8); + av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_curr %dx%d\n", i, j, col, row); + size -= 2; + av_log_missing_feature(avctx, "zlibprime_curr", 1); + return AVERROR_PATCHWELCOME; + } + size--; // account for flags byte + } + + if (has_diff) { + int k; + int off = (s->image_height - y_pos - 1) * s->frame.linesize[0]; + + for (k = 0; k < cur_blk_height; k++) + memcpy(s->frame.data[0] + off - k*s->frame.linesize[0] + x_pos*3, + s->keyframe + off - k*s->frame.linesize[0] + x_pos*3, + cur_blk_width * 3); + } + /* skip unchanged blocks, which have size 0 */ if (size) { if (flashsv_decode_block(avctx, avpkt, &gb, size, cur_blk_width, cur_blk_height, - x_pos, y_pos)) + x_pos, y_pos, + i + j * (h_blocks + !!h_part))) av_log(avctx, AV_LOG_ERROR, "error in decompression of block %dx%d\n", i, j); } } } + if (s->is_keyframe && s->ver == 2) { + if (!s->keyframe) { + s->keyframe = av_malloc(s->frame.linesize[0] * avctx->height); + if (!s->keyframe) { + av_log(avctx, AV_LOG_ERROR, "Cannot allocate image data\n"); + return AVERROR(ENOMEM); + } + } + memcpy(s->keyframe, s->frame.data[0], s->frame.linesize[0] * avctx->height); + } *data_size = sizeof(AVFrame); *(AVFrame*)data = s->frame; @@ -233,6 +451,7 @@ static av_cold int flashsv_decode_end(AVCodecContext *avctx) } +#if CONFIG_FLASHSV_DECODER AVCodec ff_flashsv_decoder = { .name = "flashsv", .type = AVMEDIA_TYPE_VIDEO, @@ -245,3 +464,67 @@ AVCodec ff_flashsv_decoder = { .pix_fmts = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_NONE}, .long_name = NULL_IF_CONFIG_SMALL("Flash Screen Video v1"), }; +#endif /* CONFIG_FLASHSV_DECODER */ + +#if CONFIG_FLASHSV2_DECODER +static const uint32_t ff_flashsv2_default_palette[128] = { + 0x000000, 0x333333, 0x666666, 0x999999, 0xCCCCCC, 0xFFFFFF, + 0x330000, 0x660000, 0x990000, 0xCC0000, 0xFF0000, 0x003300, + 0x006600, 0x009900, 0x00CC00, 0x00FF00, 0x000033, 0x000066, + 0x000099, 0x0000CC, 0x0000FF, 0x333300, 0x666600, 0x999900, + 0xCCCC00, 0xFFFF00, 0x003333, 0x006666, 0x009999, 0x00CCCC, + 0x00FFFF, 0x330033, 0x660066, 0x990099, 0xCC00CC, 0xFF00FF, + 0xFFFF33, 0xFFFF66, 0xFFFF99, 0xFFFFCC, 0xFF33FF, 0xFF66FF, + 0xFF99FF, 0xFFCCFF, 0x33FFFF, 0x66FFFF, 0x99FFFF, 0xCCFFFF, + 0xCCCC33, 0xCCCC66, 0xCCCC99, 0xCCCCFF, 0xCC33CC, 0xCC66CC, + 0xCC99CC, 0xCCFFCC, 0x33CCCC, 0x66CCCC, 0x99CCCC, 0xFFCCCC, + 0x999933, 0x999966, 0x9999CC, 0x9999FF, 0x993399, 0x996699, + 0x99CC99, 0x99FF99, 0x339999, 0x669999, 0xCC9999, 0xFF9999, + 0x666633, 0x666699, 0x6666CC, 0x6666FF, 0x663366, 0x669966, + 0x66CC66, 0x66FF66, 0x336666, 0x996666, 0xCC6666, 0xFF6666, + 0x333366, 0x333399, 0x3333CC, 0x3333FF, 0x336633, 0x339933, + 0x33CC33, 0x33FF33, 0x663333, 0x993333, 0xCC3333, 0xFF3333, + 0x003366, 0x336600, 0x660033, 0x006633, 0x330066, 0x663300, + 0x336699, 0x669933, 0x993366, 0x339966, 0x663399, 0x996633, + 0x6699CC, 0x99CC66, 0xCC6699, 0x66CC99, 0x9966CC, 0xCC9966, + 0x99CCFF, 0xCCFF99, 0xFF99CC, 0x99FFCC, 0xCC99FF, 0xFFCC99, + 0x111111, 0x222222, 0x444444, 0x555555, 0xAAAAAA, 0xBBBBBB, + 0xDDDDDD, 0xEEEEEE +}; + +static av_cold int flashsv2_decode_init(AVCodecContext *avctx) +{ + FlashSVContext *s = avctx->priv_data; + flashsv_decode_init(avctx); + s->pal = ff_flashsv2_default_palette; + s->ver = 2; + + return 0; +} + +static av_cold int flashsv2_decode_end(AVCodecContext *avctx) +{ + FlashSVContext *s = avctx->priv_data; + + av_freep(&s->keyframedata); + av_freep(&s->blocks); + av_freep(&s->keyframe); + av_freep(&s->deflate_block); + flashsv_decode_end(avctx); + + return 0; +} + +AVCodec ff_flashsv2_decoder = { + .name = "flashsv2", + .type = AVMEDIA_TYPE_VIDEO, + .id = CODEC_ID_FLASHSV2, + .priv_data_size = sizeof(FlashSVContext), + .init = flashsv2_decode_init, + .close = flashsv2_decode_end, + .decode = flashsv_decode_frame, + .capabilities = CODEC_CAP_DR1, + .pix_fmts = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_NONE}, + .long_name = NULL_IF_CONFIG_SMALL("Flash Screen Video v2"), +}; +#endif /* CONFIG_FLASHSV2_DECODER */ diff --git a/libavcodec/version.h b/libavcodec/version.h index a4e0402cad..8d40899557 100644 --- a/libavcodec/version.h +++ b/libavcodec/version.h @@ -21,7 +21,7 @@ #define AVCODEC_VERSION_H #define LIBAVCODEC_VERSION_MAJOR 53 -#define LIBAVCODEC_VERSION_MINOR 6 +#define LIBAVCODEC_VERSION_MINOR 7 #define LIBAVCODEC_VERSION_MICRO 0 #define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \ From 505345ed5d180093a44da8d70ac541898c31c22f Mon Sep 17 00:00:00 2001 From: Alex Converse Date: Fri, 22 Jul 2011 10:13:22 -0700 Subject: [PATCH 2/4] riff: Add mpgv MPEG-2 fourcc Supported by mplayer and seen in the wild. --- libavformat/riff.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavformat/riff.c b/libavformat/riff.c index 27f45b91d9..c426ae5de6 100644 --- a/libavformat/riff.c +++ b/libavformat/riff.c @@ -131,6 +131,7 @@ const AVCodecTag ff_codec_bmp_tags[] = { { CODEC_ID_MPEG2VIDEO, MKTAG('s', 'l', 'i', 'f') }, { CODEC_ID_MPEG2VIDEO, MKTAG('E', 'M', '2', 'V') }, { CODEC_ID_MPEG2VIDEO, MKTAG('M', '7', '0', '1') }, /* Matrox MPEG2 intra-only */ + { CODEC_ID_MPEG2VIDEO, MKTAG('m', 'p', 'g', 'v') }, { CODEC_ID_MJPEG, MKTAG('M', 'J', 'P', 'G') }, { CODEC_ID_MJPEG, MKTAG('L', 'J', 'P', 'G') }, { CODEC_ID_MJPEG, MKTAG('d', 'm', 'b', '1') }, From 406fbd24dc62db4853cb24b24f40caf3e70ee2e8 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Thu, 21 Jul 2011 21:15:58 -0400 Subject: [PATCH 3/4] H.264: Add optimizations to predict x86 assembly. Signed-off-by: Ronald S. Bultje --- libavcodec/x86/h264_intrapred.asm | 5 +- libavcodec/x86/h264_intrapred_10bit.asm | 1117 +++++++++-------------- libavcodec/x86/h264_intrapred_init.c | 29 +- 3 files changed, 437 insertions(+), 714 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index cbf3cf7a5c..c1cd5c4d25 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3 punpckldq m1, [r1] movq m2, m1 movq m3, m1 - movq m4, m1 psllq m1, 8 pxor m2, m1 psrlq m2, 8 - pxor m3, m2 - PRED4x4_LOWPASS m0, m1, m3, m4, m5 + pxor m2, m3 + PRED4x4_LOWPASS m0, m1, m2, m3, m4 lea r1, [r0+r2*2] psrlq m0, 8 movd [r0+r2*1], m0 diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 24a7bfa875..e14e31a38c 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -27,8 +27,6 @@ SECTION_RODATA -SECTION .text - cextern pw_16 cextern pw_8 cextern pw_4 @@ -42,6 +40,8 @@ pw_512: times 8 dw 512 pd_17: times 4 dd 17 pd_16: times 4 dd 16 +SECTION .text + ; dest, left, right, src ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 %macro PRED4x4_LOWPASS 4 @@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3 movq m3, [r0] punpckhdq m1, m2 PALIGNR m3, m1, 10, m1 - mova m1, m3 movhps m4, [r1+r2*1-8] - PALIGNR m3, m4, 14, m4 - mova m2, m3 + PALIGNR m0, m3, m4, 14, m4 movhps m4, [r1+r2*2-8] - PALIGNR m3, m4, 14, m4 - PRED4x4_LOWPASS m0, m3, m1, m2 + PALIGNR m2, m0, m4, 14, m4 + PRED4x4_LOWPASS m0, m2, m3, m0 movq [r1+r2*2], m0 psrldq m0, 2 movq [r1+r2*1], m0 @@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6 pavgw m5, m0 movhps m1, [r0+r2*1-8] PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 - mova m1, m0 movhps m2, [r0+r2*2-8] - PALIGNR m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 - mova m2, m0 + PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 movhps m3, [r1+r2*1-8] - PALIGNR m0, m3, 14, m3 ; t3t2t1t0ltl0l1l2 - PRED4x4_LOWPASS m3, m1, m0, m2 - pslldq m1, m3, 12 - psrldq m3, 4 + PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2 + PRED4x4_LOWPASS m1, m0, m2, m1 + pslldq m0, m1, 12 + psrldq m1, 4 movq [r0+r2*1], m5 - movq [r0+r2*2], m3 - PALIGNR m5, m1, 14, m2 - pslldq m1, 2 + movq [r0+r2*2], m1 + PALIGNR m5, m0, 14, m2 + pslldq m0, 2 movq [r1+r2*1], m5 - PALIGNR m3, m1, 14, m1 - movq [r1+r2*2], m3 + PALIGNR m1, m0, 14, m0 + movq [r1+r2*2], m1 RET %endmacro @@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3 punpckhdq m1, m2 ; l0 l1 l2 l3 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 - psrldq m2, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 - pavgw m5, m1, m2 - PRED4x4_LOWPASS m3, m1, m0, m2 + psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 + pavgw m5, m1, m3 + PRED4x4_LOWPASS m3, m1, m0, m3 punpcklwd m5, m3 psrldq m3, 8 PALIGNR m3, m5, 12, m4 @@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3 ;----------------------------------------------------------------------------- ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride) ;----------------------------------------------------------------------------- -;TODO: more AVX here %macro PRED4x4_DL 1 cglobal pred4x4_down_left_10_%1, 3,3 sub r0, r2 - movq m1, [r0] - movhps m1, [r1] - pslldq m5, m1, 2 - pxor m2, m5, m1 - psrldq m2, 2 - pxor m3, m1, m2 - PRED4x4_LOWPASS m0, m5, m3, m1 + movq m0, [r0] + movhps m0, [r1] + psrldq m2, m0, 2 + pslldq m3, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m3, m2, m0 lea r1, [r0+r2*2] movhps [r1+r2*2], m0 psrldq m0, 2 @@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3 sub r0, r2 movu m1, [r0] movhps m1, [r1] - psrldq m3, m1, 2 + psrldq m0, m1, 2 psrldq m2, m1, 4 - pavgw m4, m3, m1 - PRED4x4_LOWPASS m0, m1, m2, m3 + pavgw m4, m0, m1 + PRED4x4_LOWPASS m0, m1, m2, m0 lea r1, [r0+r2*2] movq [r0+r2*1], m4 movq [r0+r2*2], m0 @@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3 pavgw m2, m0 pshufw m5, m0, 11111110b - PRED4x4_LOWPASS m3, m0, m5, m1 + PRED4x4_LOWPASS m1, m0, m5, m1 movq m6, m2 - punpcklwd m6, m3 + punpcklwd m6, m1 movq [r0+r2*1], m6 psrlq m2, 16 - psrlq m3, 16 - punpcklwd m2, m3 + psrlq m1, 16 + punpcklwd m2, m1 movq [r0+r2*2], m2 psrlq m2, 32 movd [r1+r2*1], m2 @@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2 ;----------------------------------------------------------------------------- INIT_XMM cglobal pred8x8_horizontal_10_sse2, 2,3 - mov r2, 4 + mov r2d, 4 .loop: movq m0, [r0+r1*0-8] movq m1, [r0+r1*1-8] @@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 mova [r0+r1*0], m0 mova [r0+r1*1], m1 lea r0, [r0+r1*2] - dec r2 + dec r2d jg .loop REP_RET @@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 %endmacro %macro PRED8x8_DC 2 -cglobal pred8x8_dc_10_%1, 2,4 -%ifdef ARCH_X86_64 -%define t0 r10 -%else -%define t0 r0m -%endif +cglobal pred8x8_dc_10_%1, 2,6 sub r0, r1 pxor m4, m4 movq m0, [r0+0] movq m1, [r0+8] - HADDW m0, m2 - mov t0, r0 - HADDW m1, m2 +%if mmsize==16 + punpcklwd m0, m1 + movhlps m1, m0 + paddw m0, m1 +%else + pshufw m2, m0, 00001110b + pshufw m3, m1, 00001110b + paddw m0, m2 + paddw m1, m3 + punpcklwd m0, m1 +%endif + %2 m2, m0, 00001110b + paddw m0, m2 + lea r5, [r1*3] + lea r4, [r0+r1*4] movzx r2d, word [r0+r1*1-2] movzx r3d, word [r0+r1*2-2] - lea r0, [r0+r1*2] add r2d, r3d - movzx r3d, word [r0+r1*1-2] + movzx r3d, word [r0+r5*1-2] add r2d, r3d - movzx r3d, word [r0+r1*2-2] + movzx r3d, word [r4-2] add r2d, r3d - lea r0, [r0+r1*2] movd m2, r2d ; s2 - movzx r2d, word [r0+r1*1-2] - movzx r3d, word [r0+r1*2-2] - lea r0, [r0+r1*2] + movzx r2d, word [r4+r1*1-2] + movzx r3d, word [r4+r1*2-2] add r2d, r3d - movzx r3d, word [r0+r1*1-2] + movzx r3d, word [r4+r5*1-2] add r2d, r3d - movzx r3d, word [r0+r1*2-2] + movzx r3d, word [r4+r1*4-2] add r2d, r3d movd m3, r2d ; s3 - punpcklwd m0, m1 - mov r0, t0 punpcklwd m2, m3 punpckldq m0, m2 ; s0, s1, s2, s3 %2 m3, m0, 11110110b ; s2, s1, s3, s3 - lea r2, [r1+r1*2] %2 m0, m0, 01110100b ; s0, s1, s3, s1 paddw m0, m3 - lea r3, [r0+r1*4] psrlw m0, 2 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 -%ifidn %1, sse2 +%if mmsize==16 punpcklwd m0, m0 pshufd m3, m0, 11111010b punpckldq m0, m0 @@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4 %endif MOV8 r0+r1*1, m1, m2 MOV8 r0+r1*2, m1, m2 - MOV8 r0+r2*1, m1, m2 + MOV8 r0+r5*1, m1, m2 MOV8 r0+r1*4, m1, m2 - MOV8 r3+r1*1, m3, m4 - MOV8 r3+r1*2, m3, m4 - MOV8 r3+r2*1, m3, m4 - MOV8 r3+r1*4, m3, m4 + MOV8 r4+r1*1, m3, m4 + MOV8 r4+r1*2, m3, m4 + MOV8 r4+r5*1, m3, m4 + MOV8 r4+r1*4, m3, m4 RET %endmacro @@ -438,39 +432,29 @@ PRED8x8_DC sse2 , pshuflw ;----------------------------------------------------------------------------- ; void pred8x8_top_dc(pixel *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED8x8_TOP_DC 2 -cglobal pred8x8_top_dc_10_%1, 2,4 +INIT_XMM +cglobal pred8x8_top_dc_10_sse2, 2,4 sub r0, r1 - movq m0, [r0+0] - movq m1, [r0+8] - HADDW m0, m2 - HADDW m1, m3 - lea r2, [r1+r1*2] - paddw m0, [pw_2] - paddw m1, [pw_2] + mova m0, [r0] + pshuflw m1, m0, 0x4e + pshufhw m1, m1, 0x4e + paddw m0, m1 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 + paddw m0, m1 + lea r2, [r1*3] lea r3, [r0+r1*4] + paddw m0, [pw_2] psrlw m0, 2 - psrlw m1, 2 - %2 m0, m0, 0 - %2 m1, m1, 0 -%ifidn %1, sse2 - punpcklqdq m0, m1 -%endif - MOV8 r0+r1*1, m0, m1 - MOV8 r0+r1*2, m0, m1 - MOV8 r0+r2*1, m0, m1 - MOV8 r0+r1*4, m0, m1 - MOV8 r3+r1*1, m0, m1 - MOV8 r3+r1*2, m0, m1 - MOV8 r3+r2*1, m0, m1 - MOV8 r3+r1*4, m0, m1 + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + mova [r0+r2*1], m0 + mova [r0+r1*4], m0 + mova [r3+r1*1], m0 + mova [r3+r1*2], m0 + mova [r3+r2*1], m0 + mova [r3+r1*4], m0 RET -%endmacro - -INIT_MMX -PRED8x8_TOP_DC mmxext, pshufw -INIT_XMM -PRED8x8_TOP_DC sse2 , pshuflw ;----------------------------------------------------------------------------- ; void pred8x8_plane(pixel *src, int stride) @@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2 , pshuflw INIT_XMM cglobal pred8x8_plane_10_sse2, 2,7,7 sub r0, r1 - lea r2, [r1+r1*2] + lea r2, [r1*3] lea r3, [r0+r1*4] mova m2, [r0] pmaddwd m2, [pw_m32101234] @@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] sub r5d, r6d - lea r5d, [r5+r5*2] + lea r5d, [r5*3] add r4d, r5d movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] @@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 ;----------------------------------------------------------------------------- %macro PRED8x8L_128_DC 1 cglobal pred8x8l_128_dc_10_%1, 4,4 - mova m0, [pw_512] - lea r1, [r3+r3*2] + mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) + lea r1, [r3*3] lea r2, [r0+r3*4] MOV8 r0+r3*0, m0, m0 MOV8 r0+r3*1, m0, m0 @@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2 %macro PRED8x8L_TOP_DC 1 cglobal pred8x8l_top_dc_10_%1, 4,4,6 sub r0, r3 - pxor m7, m7 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 - jmp .body -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 ; top_right - jnz .body -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 -.body - lea r1, [r3+r3*2] + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m2, m1, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 HADDW m0, m1 paddw m0, [pw_4] psrlw m0, 3 @@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6 %endmacro INIT_XMM -%define PALIGNR PALIGNR_MMX PRED8x8L_TOP_DC sse2 -%define PALIGNR PALIGNR_SSSE3 -PRED8x8L_TOP_DC ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_TOP_DC avx +%endif ;----------------------------------------------------------------------------- ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- ;TODO: see if scalar is faster %macro PRED8x8L_DC 1 -cglobal pred8x8l_dc_10_%1, 4,5,8 +cglobal pred8x8l_dc_10_%1, 4,6,6 sub r0, r3 - lea r4, [r0+r3*2] - mova m0, [r0+r3*1-16] - punpckhwd m0, [r0+r3*0-16] - mova m1, [r4+r3*1-16] - punpckhwd m1, [r0+r3*2-16] - mov r4, r0 + lea r4, [r0+r3*4] + lea r5, [r3*3] + mova m0, [r0+r3*2-16] + punpckhwd m0, [r0+r3*1-16] + mova m1, [r4+r3*0-16] + punpckhwd m1, [r0+r5*1-16] punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r4+r3*2-16] + punpckhwd m2, [r4+r3*1-16] + mova m3, [r4+r3*4-16] + punpckhwd m3, [r4+r5*1-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, [r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r4] - mov r0, r4 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - test r1, r1 - jnz .do_left -.fix_lt_1: - mova m5, m3 - pxor m5, m4 - psrldq m5, 14 - pslldq m5, 12 - pxor m1, m5 - jmp .do_left -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 - jnz .body -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .body -.do_left: - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.body - lea r1, [r3+r3*2] - PRED4x4_LOWPASS m6, m2, m1, m3 - HADDW m7, m0 - HADDW m6, m0 - lea r2, [r0+r3*4] - paddw m7, [pw_8] - paddw m7, m6 - psrlw m7, 4 - SPLATW m7, m7 - mova [r0+r3*1], m7 - mova [r0+r3*2], m7 - mova [r0+r1*1], m7 - mova [r0+r3*4], m7 - mova [r2+r3*1], m7 - mova [r2+r3*2], m7 - mova [r2+r1*1], m7 - mova [r2+r3*4], m7 + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + not r1 + and r1, r3 + pslldq m4, m3, 2 + psrldq m5, m3, 2 + pshuflw m4, m4, 11100101b + pinsrw m5, [r0+r1-2], 7 + PRED4x4_LOWPASS m3, m4, m5, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 + paddw m0, m3 + HADDW m0, m1 + paddw m0, [pw_8] + psrlw m0, 4 + SPLATW m0, m0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r5*1], m0 + mova [r0+r3*4], m0 + mova [r4+r3*1], m0 + mova [r4+r3*2], m0 + mova [r4+r5*1], m0 + mova [r4+r3*4], m0 RET %endmacro INIT_XMM -%define PALIGNR PALIGNR_MMX PRED8x8L_DC sse2 -%define PALIGNR PALIGNR_SSSE3 -PRED8x8L_DC ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_DC avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) @@ -723,36 +647,17 @@ PRED8x8L_DC ssse3 %macro PRED8x8L_VERTICAL 1 cglobal pred8x8l_vertical_10_%1, 4,4,6 sub r0, r3 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 - jmp .body -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 ; top_right - jnz .body -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 -.body - lea r1, [r3+r3*2] + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m2, m1, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 mova [r0+r3*1], m0 mova [r0+r3*2], m0 mova [r0+r1*1], m0 @@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6 %endmacro INIT_XMM -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL sse2 -%define PALIGNR PALIGNR_SSSE3 -PRED8x8L_VERTICAL ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_VERTICAL avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL 1 -cglobal pred8x8l_horizontal_10_%1, 4,4,8 - sub r0, r3 - lea r2, [r0+r3*2] - mova m0, [r0+r3*1-16] - test r1, r1 - lea r1, [r0+r3] - cmovnz r1, r0 - punpckhwd m0, [r1+r3*0-16] - mova m1, [r2+r3*1-16] - punpckhwd m1, [r0+r3*2-16] - mov r2, r0 +cglobal pred8x8l_horizontal_10_%1, 4,4,5 + mova m0, [r0-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + punpckhwd m0, [r0+r1-16] + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r3*1-16] + lea r2, [r0+r3*4] + lea r1, [r3*3] punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r2+r3*0-16] + punpckhwd m2, [r0+r1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r3*1-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, [r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r1+r3*0-16] - mov r0, r2 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - lea r1, [r3+r3*2] - punpckhwd m3, m7, m7 - punpcklwd m7, m7 + PALIGNR m4, m3, [r2+r1-16], 14, m0 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m4, m3, m0, m4 + punpckhwd m3, m4, m4 + punpcklwd m4, m4 pshufd m0, m3, 0xff pshufd m1, m3, 0xaa - lea r2, [r0+r3*4] pshufd m2, m3, 0x55 pshufd m3, m3, 0x00 - pshufd m4, m7, 0xff - pshufd m5, m7, 0xaa - pshufd m6, m7, 0x55 - pshufd m7, m7, 0x00 - mova [r0+r3*1], m0 - mova [r0+r3*2], m1 - mova [r0+r1*1], m2 - mova [r0+r3*4], m3 - mova [r2+r3*1], m4 - mova [r2+r3*2], m5 - mova [r2+r1*1], m6 - mova [r2+r3*4], m7 + mova [r0+r3*0], m0 + mova [r0+r3*1], m1 + mova [r0+r3*2], m2 + mova [r0+r1*1], m3 + pshufd m0, m4, 0xff + pshufd m1, m4, 0xaa + pshufd m2, m4, 0x55 + pshufd m3, m4, 0x00 + mova [r2+r3*0], m0 + mova [r2+r3*1], m1 + mova [r2+r3*2], m2 + mova [r2+r1*1], m3 RET %endmacro @@ -837,116 +728,68 @@ INIT_XMM PRED8x8L_HORIZONTAL sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_HORIZONTAL avx +%endif ;----------------------------------------------------------------------------- ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_DOWN_LEFT 1 -cglobal pred8x8l_down_left_10_%1, 4,4,8 +cglobal pred8x8l_down_left_10_%1, 4,4,7 sub r0, r3 - mova m0, [r0-16] mova m3, [r0] + shr r1d, 14 + neg r1 + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + PRED4x4_LOWPASS m6, m2, m1, m3 + jz .fix_tr ; flags from shr r2d mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 - jmp .do_top -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 - jnz .do_top -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .do_top -.fix_tr_2: - punpckhwd m3, m3 - pshufd m1, m3, 0xFF - jmp .do_topright -.do_top: - PRED4x4_LOWPASS m4, m2, m1, m3 - mova m7, m4 - test r2, r2 - jz .fix_tr_2 - mova m0, [r0+16] - mova m5, m0 - mova m2, m0 - mova m4, m0 - psrldq m5, 14 - PALIGNR m2, m3, 14, m3 - PALIGNR m5, m4, 2, m4 - PRED4x4_LOWPASS m1, m2, m5, m0 + psrldq m5, m1, 2 + PALIGNR m2, m1, m3, 14, m3 + pshufhw m5, m5, 10100100b + PRED4x4_LOWPASS m1, m2, m5, m1 .do_topright: - lea r1, [r3+r3*2] - mova m6, m1 - psrldq m1, 14 - mova m4, m1 + lea r1, [r3*3] + psrldq m5, m1, 14 lea r2, [r0+r3*4] - mova m2, m6 - PALIGNR m2, m7, 2, m0 - mova m3, m6 - PALIGNR m3, m7, 14, m0 - PALIGNR m4, m6, 2, m0 - mova m5, m7 - mova m1, m7 - mova m7, m6 - pslldq m1, 2 - PRED4x4_LOWPASS m0, m1, m2, m5 - PRED4x4_LOWPASS m1, m3, m4, m7 + PALIGNR m2, m1, m6, 2, m0 + PALIGNR m3, m1, m6, 14, m0 + PALIGNR m5, m1, 2, m0 + pslldq m4, m6, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m1, m3, m5, m1 mova [r2+r3*4], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r2+r1*1], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r2+r3*2], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r2+r3*1], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r0+r3*4], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r0+r1*1], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r0+r3*2], m1 - pslldq m1, 2 - psrldq m0, 14 - por m1, m0 + PALIGNR m1, m6, 14, m6 mova [r0+r3*1], m1 RET +.fix_tr: + punpckhwd m3, m3 + pshufd m1, m3, 0xFF + jmp .do_topright %endmacro INIT_XMM @@ -954,139 +797,73 @@ INIT_XMM PRED8x8L_DOWN_LEFT sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_LEFT ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_DOWN_LEFT avx +%endif ;----------------------------------------------------------------------------- -;void pred8x8l_down_right_mxext(pixel *src, int has_topleft, int has_topright, int stride) +;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_DOWN_RIGHT 1 +; standard forbids this when has_topleft is false +; no need to check cglobal pred8x8l_down_right_10_%1, 4,5,8 sub r0, r3 - lea r4, [r0+r3*2] + lea r4, [r0+r3*4] + lea r1, [r3*3] mova m0, [r0+r3*1-16] punpckhwd m0, [r0+r3*0-16] - mova m1, [r4+r3*1-16] + mova m1, [r0+r1*1-16] punpckhwd m1, [r0+r3*2-16] - mov r4, r0 punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, [r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r4] - mov r0, r4 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - test r1, r1 ; top_left - jz .fix_lt_1 -.do_left: - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - mova m6, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - mova m0, [r0-16] + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m6, m1, m4, m3 + PRED4x4_LOWPASS m4, m3, m0, m4 mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS m4, m2, m1, m3 - mova m5, m4 - jmp .body -.fix_lt_1: - mova m5, m3 - pxor m5, m4 - psrldq m5, 14 - pslldq m5, 12 - pxor m1, m5 - jmp .do_left -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 ; top_right - jnz .do_top -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .do_top -.body - lea r1, [r3+r3*2] - mova m1, m7 - mova m7, m5 - mova m5, m6 - mova m2, m7 - lea r2, [r0+r3*4] - PALIGNR m2, m6, 2, m0 - mova m3, m7 - PALIGNR m3, m6, 14, m0 - mova m4, m7 - psrldq m4, 2 - PRED4x4_LOWPASS m0, m1, m2, m5 - PRED4x4_LOWPASS m1, m3, m4, m7 - mova [r2+r3*4], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r2+r1*1], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r2+r3*2], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r2+r3*1], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r0+r3*4], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r0+r1*1], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r0+r3*2], m0 - psrldq m0, 2 - pslldq m1, 14 - por m0, m1 - mova [r0+r3*1], m0 + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0-2], 0 + pinsrw m2, [r0+r2+14], 7 + PRED4x4_LOWPASS m3, m2, m1, m3 + PALIGNR m2, m3, m6, 2, m0 + PALIGNR m5, m3, m6, 14, m0 + psrldq m7, m3, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m3, m5, m7, m3 + mova [r4+r3*4], m6 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*2], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r1*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*4], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r4+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r4+r3*2], m3 + PALIGNR m3, m6, 14, m6 + mova [r4+r1*1], m3 RET %endmacro @@ -1095,114 +872,69 @@ INIT_XMM PRED8x8L_DOWN_RIGHT sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_RIGHT ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_DOWN_RIGHT avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_VERTICAL_RIGHT 1 -cglobal pred8x8l_vertical_right_10_%1, 4,5,8 +; likewise with 8x8l_down_right +cglobal pred8x8l_vertical_right_10_%1, 4,5,7 sub r0, r3 - lea r4, [r0+r3*2] + lea r4, [r0+r3*4] + lea r1, [r3*3] mova m0, [r0+r3*1-16] punpckhwd m0, [r0+r3*0-16] - mova m1, [r4+r3*1-16] + mova m1, [r0+r1*1-16] punpckhwd m1, [r0+r3*2-16] - mov r4, r0 punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, [r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r4] - mov r0, r4 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - test r1, r1 - jz .fix_lt_1 - jmp .do_left -.fix_lt_1: - mova m5, m3 - pxor m5, m4 - psrldq m5, 14 - pslldq m5, 12 - pxor m1, m5 - jmp .do_left -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 - jnz .do_top -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .do_top -.do_left: - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m7, m2 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.do_top - PRED4x4_LOWPASS m6, m2, m1, m3 - lea r1, [r3+r3*2] - mova m2, m6 - mova m3, m6 - PALIGNR m3, m7, 14, m0 - PALIGNR m6, m7, 12, m1 - mova m4, m3 - pavgw m3, m2 - lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m6, m2, m4 - mova [r0+r3*1], m3 + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + PRED4x4_LOWPASS m3, m1, m4, m3 + mova m2, [r0] + shr r2d, 13 + pslldq m1, m2, 2 + psrldq m5, m2, 2 + pinsrw m1, [r0-2], 0 + pinsrw m5, [r0+r2+14], 7 + PRED4x4_LOWPASS m2, m5, m1, m2 + PALIGNR m6, m2, m3, 12, m1 + PALIGNR m5, m2, m3, 14, m0 + PRED4x4_LOWPASS m0, m6, m2, m5 + pavgw m2, m5 mova [r0+r3*2], m0 - mova m5, m0 - mova m6, m3 - mova m1, m7 - mova m2, m1 - pslldq m2, 2 - mova m3, m1 - pslldq m3, 4 - PRED4x4_LOWPASS m0, m1, m3, m2 - PALIGNR m6, m0, 14, m2 - mova [r0+r1*1], m6 - pslldq m0, 2 - PALIGNR m5, m0, 14, m1 - mova [r0+r3*4], m5 - pslldq m0, 2 - PALIGNR m6, m0, 14, m2 - mova [r2+r3*1], m6 - pslldq m0, 2 - PALIGNR m5, m0, 14, m1 - mova [r2+r3*2], m5 - pslldq m0, 2 - PALIGNR m6, m0, 14, m2 - mova [r2+r1*1], m6 - pslldq m0, 2 - PALIGNR m5, m0, 14, m1 - mova [r2+r3*4], m5 + mova [r0+r3*1], m2 + pslldq m6, m3, 4 + pslldq m1, m3, 2 + PRED4x4_LOWPASS m1, m3, m6, m1 + PALIGNR m2, m1, 14, m4 + mova [r0+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r0+r3*4], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r3*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r4+r3*2], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m1 + mova [r4+r3*4], m0 RET %endmacro @@ -1211,84 +943,60 @@ INIT_XMM PRED8x8L_VERTICAL_RIGHT sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_RIGHT ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_VERTICAL_RIGHT avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL_UP 1 -cglobal pred8x8l_horizontal_up_10_%1, 4,4,8 - sub r0, r3 - lea r2, [r0+r3*2] - mova m0, [r0+r3*1-16] - test r1, r1 - lea r1, [r0+r3] - cmovnz r1, r0 - punpckhwd m0, [r1+r3*0-16] - mova m1, [r2+r3*1-16] - punpckhwd m1, [r0+r3*2-16] - mov r2, r0 - punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] - punpckhdq m3, m2 - punpckhqdq m3, m1 - lea r0, [r0+r3*2] +cglobal pred8x8l_horizontal_up_10_%1, 4,4,6 mova m0, [r0+r3*0-16] - mova m1, [r1+r3*0-16] - mov r0, r2 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - lea r1, [r3+r3*2] - pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 - pslldq m7, 14 ; l7 .. .. .. .. .. .. .. - mova m2, m0 - pslld m0, 16 - psrld m2, 16 - por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0 - mova m3, m2 - mova m4, m2 - mova m5, m2 - psrldq m2, 2 - psrldq m3, 4 + punpckhwd m0, [r0+r3*1-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + mova m4, [r0+r1*1-16] + lea r1, [r3*3] lea r2, [r0+r3*4] - por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1 - punpckhwd m7, m7 - por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2 - pavgw m4, m2 - PRED4x4_LOWPASS m1, m3, m5, m2 - mova m5, m4 - punpcklwd m4, m1 ; p4 p3 p2 p1 - punpckhwd m5, m1 ; p8 p7 p6 p5 - mova m6, m5 - mova m7, m5 - mova m0, m5 - PALIGNR m5, m4, 4, m1 - pshufd m1, m6, 11111001b - PALIGNR m6, m4, 8, m2 - pshufd m2, m7, 11111110b - PALIGNR m7, m4, 12, m3 - pshufd m3, m0, 11111111b - mova [r0+r3*1], m4 - mova [r0+r3*2], m5 - mova [r0+r1*1], m6 - mova [r0+r3*4], m7 + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r1*1-16] + punpckhdq m0, m1 + mova m2, [r2+r3*0-16] + punpckhwd m2, [r2+r3*1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r1*1-16] + punpckhdq m2, m3 + punpckhqdq m0, m2 + PALIGNR m1, m0, m4, 14, m4 + psrldq m2, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m1, m2, m0 + psrldq m1, m0, 2 + psrldq m2, m0, 4 + pshufhw m1, m1, 10100100b + pshufhw m2, m2, 01010100b + pavgw m4, m0, m1 + PRED4x4_LOWPASS m1, m2, m0, m1 + punpckhwd m5, m4, m1 + punpcklwd m4, m1 + mova [r2+r3*0], m5 + mova [r0+r3*0], m4 + pshufd m0, m5, 11111001b + pshufd m1, m5, 11111110b + pshufd m2, m5, 11111111b mova [r2+r3*1], m0 mova [r2+r3*2], m1 mova [r2+r1*1], m2 - mova [r2+r3*4], m3 + PALIGNR m2, m5, m4, 4, m0 + PALIGNR m3, m5, m4, 8, m1 + PALIGNR m5, m5, m4, 12, m4 + mova [r0+r3*1], m2 + mova [r0+r3*2], m3 + mova [r0+r1*1], m5 RET %endmacro @@ -1297,7 +1005,10 @@ INIT_XMM PRED8x8L_HORIZONTAL_UP sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_UP ssse3 - +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_HORIZONTAL_UP avx +%endif ;----------------------------------------------------------------------------- @@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3 %macro PRED16x16_VERTICAL 1 cglobal pred16x16_vertical_10_%1, 2,3 sub r0, r1 - mov r2, 8 + mov r2d, 8 mova m0, [r0+ 0] mova m1, [r0+mmsize] %if mmsize==8 @@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3 MOV16 r0+r1*1, m0, m1, m2, m3 MOV16 r0+r1*2, m0, m1, m2, m3 lea r0, [r0+r1*2] - dec r2 + dec r2d jg .loop REP_RET %endmacro @@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2 ;----------------------------------------------------------------------------- %macro PRED16x16_HORIZONTAL 1 cglobal pred16x16_horizontal_10_%1, 2,3 - mov r2, 8 + mov r2d, 8 .vloop: movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] @@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3 MOV16 r0+r1*0, m0, m0, m0, m0 MOV16 r0+r1*1, m1, m1, m1, m1 lea r0, [r0+r1*2] - dec r2 + dec r2d jg .vloop REP_RET %endmacro @@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2 ; void pred16x16_dc(pixel *src, int stride) ;----------------------------------------------------------------------------- %macro PRED16x16_DC 1 -cglobal pred16x16_dc_10_%1, 2,7 - mov r4, r0 +cglobal pred16x16_dc_10_%1, 2,6 + mov r5, r0 sub r0, r1 mova m0, [r0+0] paddw m0, [r0+mmsize] @@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7 %endif HADDW m0, m2 - sub r0, 2 - movzx r3d, word [r0+r1*1] - movzx r5d, word [r0+r1*2] + lea r0, [r0+r1-2] + movzx r3d, word [r0] + movzx r4d, word [r0+r1] %rep 7 lea r0, [r0+r1*2] - movzx r2d, word [r0+r1*1] + movzx r2d, word [r0] add r3d, r2d - movzx r2d, word [r0+r1*2] - add r5d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d %endrep - lea r3d, [r3+r5+16] + lea r3d, [r3+r4+16] movd m1, r3d paddw m0, m1 @@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7 SPLATW m0, m0 mov r3d, 8 .loop: - MOV16 r4+r1*0, m0, m0, m0, m0 - MOV16 r4+r1*1, m0, m0, m0, m0 - lea r4, [r4+r1*2] + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] dec r3d jg .loop REP_RET @@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2 ; void pred16x16_left_dc(pixel *src, int stride) ;----------------------------------------------------------------------------- %macro PRED16x16_LEFT_DC 1 -cglobal pred16x16_left_dc_10_%1, 2,7 - mov r4, r0 +cglobal pred16x16_left_dc_10_%1, 2,6 + mov r5, r0 sub r0, 2 - movzx r5d, word [r0+r1*0] - movzx r6d, word [r0+r1*1] + movzx r3d, word [r0] + movzx r4d, word [r0+r1] %rep 7 lea r0, [r0+r1*2] - movzx r2d, word [r0+r1*0] - movzx r3d, word [r0+r1*1] - add r5d, r2d - add r6d, r3d + movzx r2d, word [r0] + add r3d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d %endrep - lea r2d, [r5+r6+8] - shr r2d, 4 + lea r3d, [r3+r4+8] + shr r3d, 4 - movd m0, r2d + movd m0, r3d SPLATW m0, m0 mov r3d, 8 .loop: - MOV16 r4+r1*0, m0, m0, m0, m0 - MOV16 r4+r1*1, m0, m0, m0, m0 - lea r4, [r4+r1*2] + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] dec r3d jg .loop REP_RET diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 62e4c8796b..55387f623e 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); PRED8x8(dc, 10, mmxext) PRED8x8(dc, 10, sse2) -PRED8x8(top_dc, 10, mmxext) PRED8x8(top_dc, 10, sse2) PRED8x8(plane, 10, sse2) PRED8x8(vertical, 10, sse2) @@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2) void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride); PRED8x8L(dc, 10, sse2) -PRED8x8L(dc, 10, ssse3) +PRED8x8L(dc, 10, avx) PRED8x8L(128_dc, 10, mmxext) PRED8x8L(128_dc, 10, sse2) PRED8x8L(top_dc, 10, sse2) -PRED8x8L(top_dc, 10, ssse3) +PRED8x8L(top_dc, 10, avx) PRED8x8L(vertical, 10, sse2) -PRED8x8L(vertical, 10, ssse3) +PRED8x8L(vertical, 10, avx) PRED8x8L(horizontal, 10, sse2) PRED8x8L(horizontal, 10, ssse3) +PRED8x8L(horizontal, 10, avx) PRED8x8L(down_left, 10, sse2) PRED8x8L(down_left, 10, ssse3) +PRED8x8L(down_left, 10, avx) PRED8x8L(down_right, 10, sse2) PRED8x8L(down_right, 10, ssse3) +PRED8x8L(down_right, 10, avx) PRED8x8L(vertical_right, 10, sse2) PRED8x8L(vertical_right, 10, ssse3) +PRED8x8L(vertical_right, 10, avx) PRED8x8L(horizontal_up, 10, sse2) PRED8x8L(horizontal_up, 10, ssse3) +PRED8x8L(horizontal_up, 10, avx) #define PRED16x16(TYPE, DEPTH, OPT)\ void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); @@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; - h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext; h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; @@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; - h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_ssse3; h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; - h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_ssse3; - h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_ssse3; h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; } #if HAVE_AVX if (mm_flags & AV_CPU_FLAG_AVX) { h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; + h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx; + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx; } #endif /* HAVE_AVX */ } From b4cfb8254eeeb2fc0aa2c0c36a5ede208af47a79 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 22 Jul 2011 00:13:27 +0100 Subject: [PATCH 4/4] dnxhddec: avoid a branch in 10-bit decode_dct_block() The minimum weight value is 32 so this test can be skipped for the 10-bit case. Overall speedup 3-4%. Signed-off-by: Mans Rullgard --- libavcodec/dnxhddec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c index 426be2e985..63ccd08b48 100644 --- a/libavcodec/dnxhddec.c +++ b/libavcodec/dnxhddec.c @@ -239,7 +239,7 @@ static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx, //av_log(ctx->avctx, AV_LOG_DEBUG, "j %d\n", j); //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, weight %d\n", level, weight_matrix[i]); level = (2*level+1) * qscale * weight_matrix[i]; - if (weight_matrix[i] != level_bias) + if (level_bias < 32 || weight_matrix[i] != level_bias) level += level_bias; level >>= level_shift;