libavcodec/blockdsp: add AVX version

Also raise the required alignment of the block buffers from 16 to 32
bytes in several codecs.

Signed-off-by: James Almer <jamrial@gmail.com>
pull/273/head
Martin Vignali 7 years ago committed by James Almer
parent 4590d073cc
commit cbbec68847
  1. 2
      libavcodec/asv.h
  2. 4
      libavcodec/bink.c
  3. 2
      libavcodec/dnxhdenc.h
  4. 2
      libavcodec/eamad.c
  5. 2
      libavcodec/eatqi.c
  6. 2
      libavcodec/g2meet.c
  7. 2
      libavcodec/ituh263dec.c
  8. 2
      libavcodec/mdec.c
  9. 2
      libavcodec/mimic.c
  10. 2
      libavcodec/mjpegdec.h
  11. 6
      libavcodec/proresdec2.c
  12. 2
      libavcodec/speedhq.c
  13. 2
      libavcodec/wmv2.h
  14. 14
      libavcodec/x86/blockdsp.asm
  15. 6
      libavcodec/x86/blockdsp_init.c
  16. 4
      tests/checkasm/blockdsp.c

@ -54,7 +54,7 @@ typedef struct ASV1Context {
int mb_height; int mb_height;
int mb_width2; int mb_width2;
int mb_height2; int mb_height2;
DECLARE_ALIGNED(16, int16_t, block)[6][64]; DECLARE_ALIGNED(32, int16_t, block)[6][64];
uint16_t intra_matrix[64]; uint16_t intra_matrix[64];
int q_intra_matrix[64]; int q_intra_matrix[64];
uint8_t *bitstream_buffer; uint8_t *bitstream_buffer;

@ -813,7 +813,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
int v, col[2]; int v, col[2];
const uint8_t *scan; const uint8_t *scan;
int xoff, yoff; int xoff, yoff;
LOCAL_ALIGNED_16(int16_t, block, [64]); LOCAL_ALIGNED_32(int16_t, block, [64]);
LOCAL_ALIGNED_16(int32_t, dctblock, [64]); LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
int coordmap[64]; int coordmap[64];
int ybias = is_key ? -15 : 0; int ybias = is_key ? -15 : 0;
@ -976,7 +976,7 @@ static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
uint8_t *dst, *prev, *ref_start, *ref_end; uint8_t *dst, *prev, *ref_start, *ref_end;
int v, col[2]; int v, col[2];
const uint8_t *scan; const uint8_t *scan;
LOCAL_ALIGNED_16(int16_t, block, [64]); LOCAL_ALIGNED_32(int16_t, block, [64]);
LOCAL_ALIGNED_16(uint8_t, ublock, [64]); LOCAL_ALIGNED_16(uint8_t, ublock, [64]);
LOCAL_ALIGNED_16(int32_t, dctblock, [64]); LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
int coordmap[64]; int coordmap[64];

@ -74,7 +74,7 @@ typedef struct DNXHDEncContext {
unsigned min_padding; unsigned min_padding;
int intra_quant_bias; int intra_quant_bias;
DECLARE_ALIGNED(16, int16_t, blocks)[12][64]; DECLARE_ALIGNED(32, int16_t, blocks)[12][64];
DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10 DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10
DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10 DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10

@ -54,7 +54,7 @@ typedef struct MadContext {
GetBitContext gb; GetBitContext gb;
void *bitstream_buf; void *bitstream_buf;
unsigned int bitstream_buf_size; unsigned int bitstream_buf_size;
DECLARE_ALIGNED(16, int16_t, block)[64]; DECLARE_ALIGNED(32, int16_t, block)[64];
ScanTable scantable; ScanTable scantable;
uint16_t quant_matrix[64]; uint16_t quant_matrix[64];
int mb_x; int mb_x;

@ -51,7 +51,7 @@ typedef struct TqiContext {
uint16_t intra_matrix[64]; uint16_t intra_matrix[64];
int last_dc[3]; int last_dc[3];
DECLARE_ALIGNED(16, int16_t, block)[6][64]; DECLARE_ALIGNED(32, int16_t, block)[6][64];
} TqiContext; } TqiContext;
static av_cold int tqi_decode_init(AVCodecContext *avctx) static av_cold int tqi_decode_init(AVCodecContext *avctx)

@ -122,7 +122,7 @@ typedef struct JPGContext {
VLC dc_vlc[2], ac_vlc[2]; VLC dc_vlc[2], ac_vlc[2];
int prev_dc[3]; int prev_dc[3];
DECLARE_ALIGNED(16, int16_t, block)[6][64]; DECLARE_ALIGNED(32, int16_t, block)[6][64];
uint8_t *buf; uint8_t *buf;
} JPGContext; } JPGContext;

@ -574,7 +574,7 @@ not_coded:
static int h263_skip_b_part(MpegEncContext *s, int cbp) static int h263_skip_b_part(MpegEncContext *s, int cbp)
{ {
LOCAL_ALIGNED_16(int16_t, dblock, [64]); LOCAL_ALIGNED_32(int16_t, dblock, [64]);
int i, mbi; int i, mbi;
int bli[6]; int bli[6];

@ -48,7 +48,7 @@ typedef struct MDECContext {
int mb_width; int mb_width;
int mb_height; int mb_height;
int mb_x, mb_y; int mb_x, mb_y;
DECLARE_ALIGNED(16, int16_t, block)[6][64]; DECLARE_ALIGNED(32, int16_t, block)[6][64];
DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64]; DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64];
uint8_t *bitstream_buffer; uint8_t *bitstream_buffer;
unsigned int bitstream_buffer_size; unsigned int bitstream_buffer_size;

@ -49,7 +49,7 @@ typedef struct MimicContext {
ThreadFrame frames [16]; ThreadFrame frames [16];
DECLARE_ALIGNED(16, int16_t, dct_block)[64]; DECLARE_ALIGNED(32, int16_t, dct_block)[64];
GetBitContext gb; GetBitContext gb;
ScanTable scantable; ScanTable scantable;

@ -98,7 +98,7 @@ typedef struct MJpegDecodeContext {
int got_picture; ///< we found a SOF and picture is valid, too. int got_picture; ///< we found a SOF and picture is valid, too.
int linesize[MAX_COMPONENTS]; ///< linesize << interlaced int linesize[MAX_COMPONENTS]; ///< linesize << interlaced
int8_t *qscale_table; int8_t *qscale_table;
DECLARE_ALIGNED(16, int16_t, block)[64]; DECLARE_ALIGNED(32, int16_t, block)[64];
int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode) int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
uint8_t *last_nnz[MAX_COMPONENTS]; uint8_t *last_nnz[MAX_COMPONENTS];
uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode) uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)

@ -368,7 +368,7 @@ static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
const int16_t *qmat) const int16_t *qmat)
{ {
ProresContext *ctx = avctx->priv_data; ProresContext *ctx = avctx->priv_data;
LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
int16_t *block; int16_t *block;
GetBitContext gb; GetBitContext gb;
int i, blocks_per_slice = slice->mb_count<<2; int i, blocks_per_slice = slice->mb_count<<2;
@ -402,7 +402,7 @@ static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
const int16_t *qmat, int log2_blocks_per_mb) const int16_t *qmat, int log2_blocks_per_mb)
{ {
ProresContext *ctx = avctx->priv_data; ProresContext *ctx = avctx->priv_data;
LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
int16_t *block; int16_t *block;
GetBitContext gb; GetBitContext gb;
int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb; int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
@ -485,7 +485,7 @@ static void decode_slice_alpha(ProresContext *ctx,
{ {
GetBitContext gb; GetBitContext gb;
int i; int i;
LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
int16_t *block; int16_t *block;
for (i = 0; i < blocks_per_slice<<2; i++) for (i = 0; i < blocks_per_slice<<2; i++)

@ -224,7 +224,7 @@ static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb, int l
{ {
const int *quant_matrix = s->quant_matrix; const int *quant_matrix = s->quant_matrix;
const uint8_t *scantable = s->intra_scantable.permutated; const uint8_t *scantable = s->intra_scantable.permutated;
LOCAL_ALIGNED_16(int16_t, block, [64]); LOCAL_ALIGNED_32(int16_t, block, [64]);
int dc_offset; int dc_offset;
s->bdsp.clear_block(block); s->bdsp.clear_block(block);

@ -51,7 +51,7 @@ typedef struct Wmv2Context {
int hshift; int hshift;
ScanTable abt_scantable[2]; ScanTable abt_scantable[2];
DECLARE_ALIGNED(16, int16_t, abt_block2)[6][64]; DECLARE_ALIGNED(32, int16_t, abt_block2)[6][64];
} Wmv2Context; } Wmv2Context;
void ff_wmv2_common_init(Wmv2Context *w); void ff_wmv2_common_init(Wmv2Context *w);

@ -4,6 +4,8 @@
;* Copyright (c) 2008 Loren Merritt ;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2009 Fiona Glaser ;* Copyright (c) 2009 Fiona Glaser
;* ;*
;* AVX version by Jokyo Images
;*
;* This file is part of FFmpeg. ;* This file is part of FFmpeg.
;* ;*
;* FFmpeg is free software; you can redistribute it and/or ;* FFmpeg is free software; you can redistribute it and/or
@ -39,20 +41,18 @@ cglobal clear_block, 1, 1, %1, blocks
mova [blocksq+mmsize*(1+%%i)], m0 mova [blocksq+mmsize*(1+%%i)], m0
mova [blocksq+mmsize*(2+%%i)], m0 mova [blocksq+mmsize*(2+%%i)], m0
mova [blocksq+mmsize*(3+%%i)], m0 mova [blocksq+mmsize*(3+%%i)], m0
mova [blocksq+mmsize*(4+%%i)], m0 %assign %%i %%i+4
mova [blocksq+mmsize*(5+%%i)], m0
mova [blocksq+mmsize*(6+%%i)], m0
mova [blocksq+mmsize*(7+%%i)], m0
%assign %%i %%i+8
%endrep %endrep
RET RET
%endmacro %endmacro
INIT_MMX mmx INIT_MMX mmx
%define ZERO pxor %define ZERO pxor
CLEAR_BLOCK 0, 2 CLEAR_BLOCK 0, 4
INIT_XMM sse INIT_XMM sse
%define ZERO xorps %define ZERO xorps
CLEAR_BLOCK 1, 2
INIT_YMM avx
CLEAR_BLOCK 1, 1 CLEAR_BLOCK 1, 1
;----------------------------------------- ;-----------------------------------------
@ -84,3 +84,5 @@ CLEAR_BLOCKS 0
INIT_XMM sse INIT_XMM sse
%define ZERO xorps %define ZERO xorps
CLEAR_BLOCKS 1 CLEAR_BLOCKS 1
INIT_YMM avx
CLEAR_BLOCKS 1

@ -28,8 +28,10 @@
void ff_clear_block_mmx(int16_t *block); void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block); void ff_clear_block_sse(int16_t *block);
void ff_clear_block_avx(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks); void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks); void ff_clear_blocks_sse(int16_t *blocks);
void ff_clear_blocks_avx(int16_t *blocks);
av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
AVCodecContext *avctx) AVCodecContext *avctx)
@ -50,5 +52,9 @@ av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
c->clear_block = ff_clear_block_sse; c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse; c->clear_blocks = ff_clear_blocks_sse;
} }
if (EXTERNAL_AVX_FAST(cpu_flags)) {
c->clear_block = ff_clear_block_avx;
c->clear_blocks = ff_clear_blocks_avx;
}
#endif /* HAVE_X86ASM */ #endif /* HAVE_X86ASM */
} }

@ -53,8 +53,8 @@ do { \
void checkasm_check_blockdsp(void) void checkasm_check_blockdsp(void)
{ {
LOCAL_ALIGNED_16(uint16_t, buf0, [6 * 8 * 8]); LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
LOCAL_ALIGNED_16(uint16_t, buf1, [6 * 8 * 8]); LOCAL_ALIGNED_32(uint16_t, buf1, [6 * 8 * 8]);
AVCodecContext avctx = { 0 }; AVCodecContext avctx = { 0 };
BlockDSPContext h; BlockDSPContext h;

Loading…
Cancel
Save