From 2f77923d72c35f4a10b9bb1d1086d0edd7f43dde Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Sun, 18 Oct 2009 20:10:10 +0000 Subject: [PATCH] simd add_hfyu_left_prediction 2.2x faster than C on conroe, 3.6x on penryn. 4-6% faster huffyuv decoding if using left or plane mode and yuv Originally committed as revision 20287 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/dsputil.h | 2 +- libavcodec/x86/dsputil_mmx.c | 7 ++++ libavcodec/x86/dsputil_yasm.asm | 74 +++++++++++++++++++++++++++++++++ libavcodec/x86/x86inc.asm | 2 + 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 58524b26e4..ab791f53cb 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -349,7 +349,7 @@ typedef struct DSPContext { */ void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top); void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, uint8_t *diff, int w, int *left, int *left_top); - int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int acc); + int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left); void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue); /* this might write to dst[w] */ void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 2e00aa2a24..3f6f1dae6b 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2385,6 +2385,8 @@ void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top); 
+int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); +int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); /* src is const so both match the DSPContext.add_hfyu_left_prediction pointer type */ void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); @@ -2951,6 +2953,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; +#if HAVE_YASM + c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; + if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe + c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; +#endif } #endif diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 9d5e62e57d..d071186f5c 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -21,6 +21,13 @@ %include "x86inc.asm" +SECTION_RODATA +pb_f: times 16 db 15 +pb_zzzzzzzz77777777: times 8 db -1 +pb_7: times 8 db 7 +pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 +pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 + section .text align=16 %macro PSWAPD_SSE 2 @@ -150,3 +157,70 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to movzx r2d, byte [topq-1] mov [left_topq], r2d RET + + +%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned + add srcq, wq + add dstq, wq + neg wq +%%.loop: + mova m1, [srcq+wq] + mova m2, m1 + psllw m1, 8 + paddb m1, m2 + mova m2, m1 + pshufb m1, m3 + paddb m1, m2 + pshufb m0, m5 + mova m2, m1 + pshufb m1, m4 + paddb m1, m2 +%if mmsize == 16 + mova m2, m1 + pshufb m1, m6 + paddb m1, m2 +%endif + paddb m0, m1 +%if %1 + mova [dstq+wq], m0 +%else + 
movq [dstq+wq], m0 + movhps [dstq+wq+8], m0 +%endif + add wq, mmsize + jl %%.loop + mov eax, mmsize-1 + sub eax, wd + movd m1, eax + pshufb m0, m1 + movd eax, m0 + RET +%endmacro + +; int ff_add_hfyu_left_prediction(uint8_t *dst, uint8_t *src, int w, int left) +INIT_MMX +cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left +.skip_prologue: + mova m5, [pb_7 GLOBAL] + mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] + mova m3, [pb_zz11zz55zz99zzdd GLOBAL] + movd m0, leftm + psllq m0, 56 + ADD_HFYU_LEFT_LOOP 1 + +INIT_XMM +cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left + mova m5, [pb_f GLOBAL] + mova m6, [pb_zzzzzzzz77777777 GLOBAL] + mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] + mova m3, [pb_zz11zz55zz99zzdd GLOBAL] + movd m0, leftm + pslldq m0, 15 + test srcq, 15 + jnz ff_add_hfyu_left_prediction_ssse3 %+ .skip_prologue + test dstq, 15 + jnz .unaligned + ADD_HFYU_LEFT_LOOP 1 +.unaligned: + ADD_HFYU_LEFT_LOOP 0 + diff --git a/libavcodec/x86/x86inc.asm b/libavcodec/x86/x86inc.asm index 52624c3aca..e49c34f1b1 100644 --- a/libavcodec/x86/x86inc.asm +++ b/libavcodec/x86/x86inc.asm @@ -221,6 +221,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name, %%i %assign %%i %%i+1 %endrep @@ -232,6 +233,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m CAT_XDEFINE arg_name, %%i, %1 %assign %%i %%i+1 %rotate 1