From 6d5636ad9ab6bd9bedf902051d88b7044385f88b Mon Sep 17 00:00:00 2001
From: Pierre Edouard Lepere
Date: Mon, 18 Aug 2014 10:01:09 +0200
Subject: [PATCH] hevc: x86: Add add_residual() SIMD optimizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Initially written by Pierre Edouard Lepere, extended by James Almer.

Signed-off-by: Alexandra Hájková
---
 libavcodec/hevcdsp.h            |   2 +-
 libavcodec/x86/Makefile         |   7 +-
 libavcodec/x86/hevc_add_res.asm | 369 ++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp_init.c   |  42 ++++
 4 files changed, 416 insertions(+), 4 deletions(-)
 create mode 100644 libavcodec/x86/hevc_add_res.asm

diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 199e5a9064..49cb7110d5 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -42,7 +42,7 @@ typedef struct HEVCDSPContext {
     void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
                     GetBitContext *gb, int pcm_bit_depth);
 
-    void (*add_residual[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
     void (*dequant)(int16_t *coeffs);
     void (*transform_4x4_luma)(int16_t *coeffs);
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a38535b98f..094c1fa517 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -115,9 +115,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o            \
-                                          x86/hevc_mc.o                 \
-                                          x86/hevc_idct.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_add_res.o            \
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o               \
+                                          x86/hevc_mc.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
new file mode 100644
index 0000000000..66b929c594
--- /dev/null
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -0,0 +1,369 @@
+; *****************************************************************************
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of Libav.
+; *
+; * Libav is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * Libav is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with Libav; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; ******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10:          times 16 dw ((1 << 10)-1) ; 1023 in each of 16 words
+
+SECTION .text
+
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
+    mova              m0, [r1]          ; residual row 0 (4 x int16)
+    mova              m2, [r1+8]        ; residual row 1 (4 x int16)
+    pxor              m1, m1
+    pxor              m3, m3
+    psubw             m1, m0            ; m1 = -row0
+    psubw             m3, m2            ; m3 = -row1
+    packuswb          m0, m2            ; unsigned-saturated bytes: positive residuals
+    packuswb          m1, m3            ; unsigned-saturated bytes: magnitudes of negative residuals
+
+    movd              m2, [r0]
+    movd              m3, [r0+r2]
+    punpckldq         m2, m3            ; m2 = dst row 0 | dst row 1
+    paddusb           m0, m2            ; dst + positive part, saturating
+    psubusb           m0, m1            ; minus negative magnitudes, saturating
+    movd            [r0], m0
+    psrlq             m0, 32
+    movd         [r0+r2], m0
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 3, 6
+    ADD_RES_MMX_4_8
+    add               r1, 16            ; two residual rows (2 x 8 bytes) consumed
+    lea               r0, [r0+r2*2]     ; advance dst by two rows
+    ADD_RES_MMX_4_8
+    RET
+
+%macro ADD_RES_SSE_8_8 0
+    pxor              m3, m3
+    mova              m4, [r1]
+    mova              m6, [r1+16]
+    mova              m0, [r1+32]
+    mova              m2, [r1+48]
+    psubw             m5, m3, m4        ; negated copies of the four residual rows
+    psubw             m7, m3, m6
+    psubw             m1, m3, m0
+    packuswb          m4, m0            ; rows 0|2, positive part
+    packuswb          m5, m1            ; rows 0|2, negative magnitudes
+    psubw             m3, m2
+    packuswb          m6, m2            ; rows 1|3, positive part
+    packuswb          m7, m3            ; rows 1|3, negative magnitudes
+
+    movq              m0, [r0]
+    movq              m1, [r0+r2]
+    movhps            m0, [r0+r2*2]
+    movhps            m1, [r0+r3]       ; r3 = 3 * stride, set up by the caller
+    paddusb           m0, m4
+    paddusb           m1, m6
+    psubusb           m0, m5
+    psubusb           m1, m7
+    movq            [r0], m0
+    movq         [r0+r2], m1
+    movhps     [r0+2*r2], m0
+    movhps       [r0+r3], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_32_8 3 ; %1 = byte offset into res, %2/%3 = the two dst addresses written
+    mova             xm2, [r1+%1]
+    mova             xm6, [r1+%1+16]
+%if cpuflag(avx2)
+    vinserti128       m2, m2, [r1+%1+32], 1
+    vinserti128       m6, m6, [r1+%1+48], 1
+%endif
+    psubw             m1, m0, m2        ; m0 is zero (cleared by the caller): m1 = -res
+    psubw             m5, m0, m6
+    packuswb          m2, m6
+    packuswb          m1, m5
+
+    mova             xm4, [r1+%1+mmsize*2]
+    mova             xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m4, m4, [r1+%1+96 ], 1
+    vinserti128       m6, m6, [r1+%1+112], 1
+%endif
+    psubw             m3, m0, m4
+    psubw             m5, m0, m6
+    packuswb          m4, m6
+    packuswb          m3, m5
+
+    paddusb           m2, [%2]
+    paddusb           m4, [%3]
+    psubusb           m2, m1
+    psubusb           m4, m3
+    mova            [%2], m2
+    mova            [%3], m4
+%endmacro
+
+
+%macro TRANSFORM_ADD_8 0
+; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_8_8, 3, 4, 8
+    lea               r3, [r2*3]
+    ADD_RES_SSE_8_8
+    add               r1, 64
+    lea               r0, [r0+r2*4]
+    ADD_RES_SSE_8_8
+    RET
+
+; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_16_8, 3, 5, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    mov              r4d, 4             ; 4 iterations x 4 rows
+.loop:
+    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
+    add               r1, 128
+    lea               r0, [r0+r2*4]
+    dec              r4d
+    jg .loop
+    RET
+
+; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 5, 7
+    pxor              m0, m0
+    mov              r4d, 16            ; 16 iterations x 2 rows
+.loop:
+    ADD_RES_SSE_16_32_8  0, r0,    r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
+    add               r1, 128
+    lea               r0, [r0+r2*2]
+    dec              r4d
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+TRANSFORM_ADD_8
+INIT_XMM avx
+TRANSFORM_ADD_8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 5, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    mov              r4d, 8             ; 8 iterations x 4 rows
+.loop:
+    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
+    add               r1, 256
+    lea               r0, [r0+r2*4]
+    dec              r4d
+    jg .loop
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
+
+%macro ADD_RES_SSE_8_10 4 ; %1 = dst, %2 = stride, %3 = 3 * stride, %4 = res
+    mova              m0, [%4]
+    mova              m1, [%4+16]
+    mova              m2, [%4+32]
+    mova              m3, [%4+48]
+    paddw             m0, [%1+0]
+    paddw             m1, [%1+%2]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3]
+    CLIPW             m0, m4, m5        ; m4/m5 hold the bounds loaded by the caller
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
+    mova       [%1+%2*2], m2
+    mova         [%1+%3], m3
+%endmacro
+
+%macro ADD_RES_MMX_4_10 3 ; %1 = dst, %2 = stride, %3 = res
+    mova              m0, [%1+0]
+    mova              m1, [%1+%2]
+    paddw             m0, [%3]
+    paddw             m1, [%3+8]
+    CLIPW             m0, m2, m3
+    CLIPW             m1, m2, m3
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_10 3 ; %1 = dst, %2 = stride, %3 = res
+    mova              m0, [%3]
+    mova              m1, [%3+16]
+    mova              m2, [%3+32]
+    mova              m3, [%3+48]
+    paddw             m0, [%1]
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+%2]
+    paddw             m3, [%1+%2+16]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova            [%1], m0
+    mova         [%1+16], m1
+    mova         [%1+%2], m2
+    mova      [%1+%2+16], m3
+%endmacro
+
+%macro ADD_RES_SSE_32_10 2 ; %1 = dst, %2 = res
+    mova              m0, [%2]
+    mova              m1, [%2+16]
+    mova              m2, [%2+32]
+    mova              m3, [%2+48]
+
+    paddw             m0, [%1]
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+32]
+    paddw             m3, [%1+48]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova            [%1], m0
+    mova         [%1+16], m1
+    mova         [%1+32], m2
+    mova         [%1+48], m3
+%endmacro
+
+%macro ADD_RES_AVX2_16_10 4 ; %1 = dst, %2 = stride, %3 = 3 * stride, %4 = res
+    mova              m0, [%4]
+    mova              m1, [%4+32]
+    mova              m2, [%4+64]
+    mova              m3, [%4+96]
+
+    paddw             m0, [%1+0]
+    paddw             m1, [%1+%2]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova          [%1+0], m0
+    mova         [%1+%2], m1
+    mova       [%1+%2*2], m2
+    mova         [%1+%3], m3
+%endmacro
+
+%macro ADD_RES_AVX2_32_10 3 ; %1 = dst, %2 = stride, %3 = res
+    mova              m0, [%3]
+    mova              m1, [%3+32]
+    mova              m2, [%3+64]
+    mova              m3, [%3+96]
+
+    paddw             m0, [%1]
+    paddw             m1, [%1+32]
+    paddw             m2, [%1+%2]
+    paddw             m3, [%1+%2+32]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova            [%1], m0
+    mova         [%1+32], m1
+    mova         [%1+%2], m2
+    mova      [%1+%2+32], m3
+%endmacro
+
+; void ff_hevc_add_residual_<4|8|16|32>_10_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+INIT_MMX mmxext
+cglobal hevc_add_residual_4_10, 3, 3, 6
+    pxor              m2, m2            ; zero, second CLIPW operand
+    mova              m3, [max_pixels_10] ; (1 << 10) - 1, third CLIPW operand
+    ADD_RES_MMX_4_10  r0, r2, r1
+    add               r1, 16
+    lea               r0, [r0+2*r2]
+    ADD_RES_MMX_4_10  r0, r2, r1
+    RET
+
+INIT_XMM sse2
+cglobal hevc_add_residual_8_10, 3, 4, 6
+    pxor              m4, m4            ; zero, second CLIPW operand
+    mova              m5, [max_pixels_10] ; (1 << 10) - 1, third CLIPW operand
+    lea               r3, [r2*3]
+
+    ADD_RES_SSE_8_10  r0, r2, r3, r1
+    lea               r0, [r0+r2*4]
+    add               r1, 64
+    ADD_RES_SSE_8_10  r0, r2, r3, r1
+    RET
+
+cglobal hevc_add_residual_16_10, 3, 5, 6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    mov              r4d, 8             ; 8 iterations x 2 rows
+.loop:
+    ADD_RES_SSE_16_10 r0, r2, r1
+    lea               r0, [r0+r2*2]
+    add               r1, 64
+    dec              r4d
+    jg .loop
+    RET
+
+cglobal hevc_add_residual_32_10, 3, 5, 6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    mov              r4d, 32            ; 32 iterations x 1 row
+.loop:
+    ADD_RES_SSE_32_10 r0, r1
+    lea               r0, [r0+r2]
+    add               r1, 64
+    dec              r4d
+    jg .loop
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal hevc_add_residual_16_10, 3, 5, 6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+    lea               r3, [r2*3]
+
+    mov              r4d, 4             ; 4 iterations x 4 rows
+.loop:
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
+    lea               r0, [r0+r2*4]
+    add               r1, 128
+    dec              r4d
+    jg .loop
+    RET
+
+cglobal hevc_add_residual_32_10, 3, 5, 6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    mov              r4d, 16            ; 16 iterations x 2 rows
+.loop:
+    ADD_RES_AVX2_32_10 r0, r2, r1
+    lea               r0, [r0+r2*2]
+    add               r1, 128
+    dec              r4d
+    jg .loop
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 0a06347903..a95fa30a95 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -91,6 +91,25 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
 IDCT_FUNCS(sse2)
 IDCT_FUNCS(avx)
 
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+
 #define GET_PIXELS(width, depth, cf)                                                                      \
 void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride,             \
                                                            uint8_t *src, ptrdiff_t srcstride,             \
@@ -278,17 +297,24 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
+
+            c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 
+            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
+
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
             c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
 
             c->idct[0] = ff_hevc_idct_4x4_8_sse2;
             c->idct[1] = ff_hevc_idct_8x8_8_sse2;
+
             SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
             SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 
@@ -307,11 +333,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_AVX(cpu_flags)) {
             c->idct[0] = ff_hevc_idct_4x4_8_avx;
             c->idct[1] = ff_hevc_idct_8x8_8_avx;
+            c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
+            c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
+
+            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
@@ -330,11 +364,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
             SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
             SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
+
+            c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
+            c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
+            c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
         }
         if (EXTERNAL_AVX(cpu_flags)) {
             c->idct[0] = ff_hevc_idct_4x4_10_avx;
             c->idct[1] = ff_hevc_idct_8x8_10_avx;
         }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
+        }
     }
 
 #if ARCH_X86_64