From cbbec68847ed3485900e83ec231871f71bb97d0d Mon Sep 17 00:00:00 2001 From: Martin Vignali Date: Mon, 2 Oct 2017 01:29:32 +0200 Subject: [PATCH] libavcodec/blockdsp : add AVX version Also modify the required alignment, to 32 instead of 16 for several codecs Signed-off-by: James Almer --- libavcodec/asv.h | 2 +- libavcodec/bink.c | 4 ++-- libavcodec/dnxhdenc.h | 2 +- libavcodec/eamad.c | 2 +- libavcodec/eatqi.c | 2 +- libavcodec/g2meet.c | 2 +- libavcodec/ituh263dec.c | 2 +- libavcodec/mdec.c | 2 +- libavcodec/mimic.c | 2 +- libavcodec/mjpegdec.h | 2 +- libavcodec/proresdec2.c | 6 +++--- libavcodec/speedhq.c | 2 +- libavcodec/wmv2.h | 2 +- libavcodec/x86/blockdsp.asm | 14 ++++++++------ libavcodec/x86/blockdsp_init.c | 6 ++++++ tests/checkasm/blockdsp.c | 4 ++-- 16 files changed, 32 insertions(+), 24 deletions(-) diff --git a/libavcodec/asv.h b/libavcodec/asv.h index e2cdc81300..a1366b6fe4 100644 --- a/libavcodec/asv.h +++ b/libavcodec/asv.h @@ -54,7 +54,7 @@ typedef struct ASV1Context { int mb_height; int mb_width2; int mb_height2; - DECLARE_ALIGNED(16, int16_t, block)[6][64]; + DECLARE_ALIGNED(32, int16_t, block)[6][64]; uint16_t intra_matrix[64]; int q_intra_matrix[64]; uint8_t *bitstream_buffer; diff --git a/libavcodec/bink.c b/libavcodec/bink.c index cc55870114..346b6cda9d 100644 --- a/libavcodec/bink.c +++ b/libavcodec/bink.c @@ -813,7 +813,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, int v, col[2]; const uint8_t *scan; int xoff, yoff; - LOCAL_ALIGNED_16(int16_t, block, [64]); + LOCAL_ALIGNED_32(int16_t, block, [64]); LOCAL_ALIGNED_16(int32_t, dctblock, [64]); int coordmap[64]; int ybias = is_key ? -15 : 0; @@ -976,7 +976,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb, uint8_t *dst, *prev, *ref_start, *ref_end; int v, col[2]; const uint8_t *scan; - LOCAL_ALIGNED_16(int16_t, block, [64]); + LOCAL_ALIGNED_32(int16_t, block, [64]); LOCAL_ALIGNED_16(uint8_t, ublock, [64]); LOCAL_ALIGNED_16(int32_t, dctblock, [64]); int coordmap[64]; diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h index 26c3eec695..963821ac81 100644 --- a/libavcodec/dnxhdenc.h +++ b/libavcodec/dnxhdenc.h @@ -74,7 +74,7 @@ typedef struct DNXHDEncContext { unsigned min_padding; int intra_quant_bias; - DECLARE_ALIGNED(16, int16_t, blocks)[12][64]; + DECLARE_ALIGNED(32, int16_t, blocks)[12][64]; DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10 DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10 diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c index 753dee06c3..7f28abbafe 100644 --- a/libavcodec/eamad.c +++ b/libavcodec/eamad.c @@ -54,7 +54,7 @@ typedef struct MadContext { GetBitContext gb; void *bitstream_buf; unsigned int bitstream_buf_size; - DECLARE_ALIGNED(16, int16_t, block)[64]; + DECLARE_ALIGNED(32, int16_t, block)[64]; ScanTable scantable; uint16_t quant_matrix[64]; int mb_x; diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c index 725289448a..1a847a35da 100644 --- a/libavcodec/eatqi.c +++ b/libavcodec/eatqi.c @@ -51,7 +51,7 @@ typedef struct TqiContext { uint16_t intra_matrix[64]; int last_dc[3]; - DECLARE_ALIGNED(16, int16_t, block)[6][64]; + DECLARE_ALIGNED(32, int16_t, block)[6][64]; } TqiContext; static av_cold int tqi_decode_init(AVCodecContext *avctx) diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c index 10b6808f81..842095ba3b 100644 --- a/libavcodec/g2meet.c +++ b/libavcodec/g2meet.c @@ -122,7 +122,7 @@ typedef struct JPGContext { VLC dc_vlc[2], ac_vlc[2]; int prev_dc[3]; - DECLARE_ALIGNED(16, int16_t, block)[6][64]; + DECLARE_ALIGNED(32, int16_t, block)[6][64]; uint8_t *buf; } JPGContext; diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c index edb68861ac..fc95a532ce 100644 --- a/libavcodec/ituh263dec.c +++ b/libavcodec/ituh263dec.c @@ -574,7 +574,7 @@ not_coded: static int h263_skip_b_part(MpegEncContext *s, int cbp) { - LOCAL_ALIGNED_16(int16_t, dblock, [64]); + LOCAL_ALIGNED_32(int16_t, dblock, [64]); int i, mbi; int bli[6]; diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c index 59658b331d..330b761279 100644 --- a/libavcodec/mdec.c +++ b/libavcodec/mdec.c @@ -48,7 +48,7 @@ typedef struct MDECContext { int mb_width; int mb_height; int mb_x, mb_y; - DECLARE_ALIGNED(16, int16_t, block)[6][64]; + DECLARE_ALIGNED(32, int16_t, block)[6][64]; DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64]; uint8_t *bitstream_buffer; unsigned int bitstream_buffer_size; diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c index 02d8b30e31..1d463e9962 100644 --- a/libavcodec/mimic.c +++ b/libavcodec/mimic.c @@ -49,7 +49,7 @@ typedef struct MimicContext { ThreadFrame frames [16]; - DECLARE_ALIGNED(16, int16_t, dct_block)[64]; + DECLARE_ALIGNED(32, int16_t, dct_block)[64]; GetBitContext gb; ScanTable scantable; diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h index 2bc69fa930..c84a40aa6e 100644 --- a/libavcodec/mjpegdec.h +++ b/libavcodec/mjpegdec.h @@ -98,7 +98,7 @@ typedef struct MJpegDecodeContext { int got_picture; ///< we found a SOF and picture is valid, too. int linesize[MAX_COMPONENTS]; ///< linesize << interlaced int8_t *qscale_table; - DECLARE_ALIGNED(16, int16_t, block)[64]; + DECLARE_ALIGNED(32, int16_t, block)[64]; int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode) uint8_t *last_nnz[MAX_COMPONENTS]; uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode) diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c index a7cea2b3b8..0f791de97b 100644 --- a/libavcodec/proresdec2.c +++ b/libavcodec/proresdec2.c @@ -368,7 +368,7 @@ static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice, const int16_t *qmat) { ProresContext *ctx = avctx->priv_data; - LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); + LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]); int16_t *block; GetBitContext gb; int i, blocks_per_slice = slice->mb_count<<2; @@ -402,7 +402,7 @@ static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice, const int16_t *qmat, int log2_blocks_per_mb) { ProresContext *ctx = avctx->priv_data; - LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); + LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]); int16_t *block; GetBitContext gb; int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb; @@ -485,7 +485,7 @@ static void decode_slice_alpha(ProresContext *ctx, { GetBitContext gb; int i; - LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); + LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]); int16_t *block; for (i = 0; i < blocks_per_slice<<2; i++) diff --git a/libavcodec/speedhq.c b/libavcodec/speedhq.c index 47b1e4dc7a..6d3487ca19 100644 --- a/libavcodec/speedhq.c +++ b/libavcodec/speedhq.c @@ -224,7 +224,7 @@ static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb, int l { const int *quant_matrix = s->quant_matrix; const uint8_t *scantable = s->intra_scantable.permutated; - LOCAL_ALIGNED_16(int16_t, block, [64]); + LOCAL_ALIGNED_32(int16_t, block, [64]); int dc_offset; s->bdsp.clear_block(block); diff --git a/libavcodec/wmv2.h b/libavcodec/wmv2.h index 31593b8c38..0f459ae5ae 100644 --- a/libavcodec/wmv2.h +++ b/libavcodec/wmv2.h @@ -51,7 +51,7 @@ typedef struct Wmv2Context { int hshift; ScanTable abt_scantable[2]; - DECLARE_ALIGNED(16, int16_t, abt_block2)[6][64]; + DECLARE_ALIGNED(32, int16_t, abt_block2)[6][64]; } Wmv2Context; void ff_wmv2_common_init(Wmv2Context *w); diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm index 7cbfa3a843..2498bd40b3 100644 --- a/libavcodec/x86/blockdsp.asm +++ b/libavcodec/x86/blockdsp.asm @@ -4,6 +4,8 @@ ;* Copyright (c) 2008 Loren Merritt ;* Copyright (c) 2009 Fiona Glaser ;* +;* AVX version by Jokyo Images +;* ;* This file is part of FFmpeg. ;* ;* FFmpeg is free software; you can redistribute it and/or @@ -39,20 +41,18 @@ cglobal clear_block, 1, 1, %1, blocks mova [blocksq+mmsize*(1+%%i)], m0 mova [blocksq+mmsize*(2+%%i)], m0 mova [blocksq+mmsize*(3+%%i)], m0 - mova [blocksq+mmsize*(4+%%i)], m0 - mova [blocksq+mmsize*(5+%%i)], m0 - mova [blocksq+mmsize*(6+%%i)], m0 - mova [blocksq+mmsize*(7+%%i)], m0 -%assign %%i %%i+8 +%assign %%i %%i+4 %endrep RET %endmacro INIT_MMX mmx %define ZERO pxor -CLEAR_BLOCK 0, 2 +CLEAR_BLOCK 0, 4 INIT_XMM sse %define ZERO xorps +CLEAR_BLOCK 1, 2 +INIT_YMM avx CLEAR_BLOCK 1, 1 ;----------------------------------------- @@ -84,3 +84,5 @@ CLEAR_BLOCKS 0 INIT_XMM sse %define ZERO xorps CLEAR_BLOCKS 1 +INIT_YMM avx +CLEAR_BLOCKS 1 diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c index afd25e1cbb..8b01a447cd 100644 --- a/libavcodec/x86/blockdsp_init.c +++ b/libavcodec/x86/blockdsp_init.c @@ -28,8 +28,10 @@ void ff_clear_block_mmx(int16_t *block); void ff_clear_block_sse(int16_t *block); +void ff_clear_block_avx(int16_t *block); void ff_clear_blocks_mmx(int16_t *blocks); void ff_clear_blocks_sse(int16_t *blocks); +void ff_clear_blocks_avx(int16_t *blocks); av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, AVCodecContext *avctx) @@ -50,5 +52,9 @@ av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, c->clear_block = ff_clear_block_sse; c->clear_blocks = ff_clear_blocks_sse; } + if (EXTERNAL_AVX_FAST(cpu_flags)) { + c->clear_block = ff_clear_block_avx; + c->clear_blocks = ff_clear_blocks_avx; + } #endif /* HAVE_X86ASM */ } diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c index 153699b632..c753506b3c 100644 --- a/tests/checkasm/blockdsp.c +++ b/tests/checkasm/blockdsp.c @@ -53,8 +53,8 @@ do { \ void checkasm_check_blockdsp(void) { - LOCAL_ALIGNED_16(uint16_t, buf0, [6 * 8 * 8]); - LOCAL_ALIGNED_16(uint16_t, buf1, [6 * 8 * 8]); + LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]); + LOCAL_ALIGNED_32(uint16_t, buf1, [6 * 8 * 8]); AVCodecContext avctx = { 0 }; BlockDSPContext h;