From 73c4f63ba5a36f3998159dcd5a4a2ec7500eb557 Mon Sep 17 00:00:00 2001 From: James Almer Date: Tue, 29 Jul 2014 04:30:13 -0300 Subject: [PATCH] x86/hevc_deblock: add add ff_hevc_[hv]_loop_filter_luma_{8, 10, 12}_avx ~5% faster than SSSE3 Signed-off-by: James Almer Signed-off-by: Michael Niedermayer --- libavcodec/x86/hevc_deblock.asm | 10 +++++++++- libavcodec/x86/hevcdsp_init.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm index d6e8806f87..2d4353a549 100644 --- a/libavcodec/x86/hevc_deblock.asm +++ b/libavcodec/x86/hevc_deblock.asm @@ -663,11 +663,11 @@ ALIGN 16 MASKED_COPY m4, m8 %endmacro -INIT_XMM sse2 ;----------------------------------------------------------------------------- ; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, ; uint8_t *_no_p, uint8_t *_no_q); ;----------------------------------------------------------------------------- +%macro LOOP_FILTER_CHROMA 0 cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride sub pixq, 2 lea r3strideq, [3*strideq] @@ -752,6 +752,12 @@ cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0 movu [pix0q+strideq], m1 movu [pixq], m2 RET +%endmacro + +INIT_XMM sse2 +LOOP_FILTER_CHROMA +INIT_XMM avx +LOOP_FILTER_CHROMA %if ARCH_X86_64 %macro LOOP_FILTER_LUMA 0 @@ -903,4 +909,6 @@ INIT_XMM sse2 LOOP_FILTER_LUMA INIT_XMM ssse3 LOOP_FILTER_LUMA +INIT_XMM avx +LOOP_FILTER_LUMA %endif diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index 3e8704aec8..828c081a2e 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -45,12 +45,18 @@ void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, pt LFC_FUNCS(uint8_t, 8, sse2) LFC_FUNCS(uint8_t, 10, sse2) LFC_FUNCS(uint8_t, 12, sse2) +LFC_FUNCS(uint8_t, 8, avx) +LFC_FUNCS(uint8_t, 10, avx) +LFC_FUNCS(uint8_t, 12, avx) LFL_FUNCS(uint8_t, 8, sse2) LFL_FUNCS(uint8_t, 10, sse2) LFL_FUNCS(uint8_t, 12, sse2) LFL_FUNCS(uint8_t, 8, ssse3) LFL_FUNCS(uint8_t, 10, ssse3) LFL_FUNCS(uint8_t, 12, ssse3) +LFL_FUNCS(uint8_t, 8, avx) +LFL_FUNCS(uint8_t, 10, avx) +LFL_FUNCS(uint8_t, 12, avx) #define IDCT_FUNCS(W, opt) \ void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \ @@ -492,6 +498,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4); QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); } + if (EXTERNAL_AVX(cpu_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx; + } + } if (EXTERNAL_AVX2(cpu_flags)) { c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2; @@ -528,6 +542,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4); QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4); } + if (EXTERNAL_AVX(cpu_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx; + } + } if (EXTERNAL_AVX2(cpu_flags)) { c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2; c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2; @@ -565,6 +587,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4); QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4); } + if (EXTERNAL_AVX(cpu_flags)) { + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx; + if (ARCH_X86_64) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx; + } + } if (EXTERNAL_AVX2(cpu_flags)) { c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2; c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;