From ad733089b024e4a3ff8f024d247a032f79a50ac8 Mon Sep 17 00:00:00 2001 From: Timothy Gu Date: Fri, 23 May 2014 17:18:09 -0700 Subject: [PATCH] x86: dsputilenc: convert ff_sse{8, 16}_mmx() to yasm Signed-off-by: Timothy Gu Signed-off-by: Michael Niedermayer --- libavcodec/x86/dsputilenc.asm | 52 +++++++++--- libavcodec/x86/dsputilenc_mmx.c | 141 ++------------------------------ 2 files changed, 47 insertions(+), 146 deletions(-) diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index 46330fe116..c6f33dcc26 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -274,19 +274,27 @@ INIT_XMM ssse3 %define ABS_SUM_8x8 ABS_SUM_8x8_64 HADAMARD8_DIFF 9 -INIT_XMM sse2 -; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, -; int line_size, int h); -cglobal sse16, 5, 5, 8 - shr r4d, 1 +; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, +; int line_size, int h) + +%macro SUM_SQUARED_ERRORS 2 +cglobal sse%1, 5,5,%2, v, pix1, pix2, lsize, h +%if %1 == mmsize + shr hd, 1 +%endif pxor m0, m0 ; mm0 = 0 pxor m7, m7 ; mm7 holds the sum .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned - movu m1, [r1 ] ; mm1 = pix1[0][0-15] - movu m2, [r2 ] ; mm2 = pix2[0][0-15] - movu m3, [r1+r3] ; mm3 = pix1[1][0-15] - movu m4, [r2+r3] ; mm4 = pix2[1][0-15] + movu m1, [pix1q] ; m1 = pix1[0][0-15], [0-7] for mmx + movu m2, [pix2q] ; m2 = pix2[0][0-15], [0-7] for mmx +%if %1 == mmsize + movu m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx + movu m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx +%else ; %1 / 2 == mmsize; mmx only + mova m3, [pix1q+8] ; m3 = pix1[0][8-15] + mova m4, [pix2q+8] ; m4 = pix2[0][8-15] +%endif ; todo: mm1-mm2, mm3-mm4 ; algo: subtract mm1 from mm2 with saturation and vice versa @@ -315,25 +323,43 @@ cglobal sse16, 5, 5, 8 pmaddwd m1, m1 pmaddwd m3, m3 - lea r1, [r1+r3*2] ; pix1 += 2*line_size - lea r2, [r2+r3*2] ; pix2 += 2*line_size - paddd m1, m2 paddd m3, m4 paddd m7, m1 paddd m7, m3 - dec r4 +%if %1 == mmsize + lea pix1q, [pix1q + 2*lsizeq] + lea pix2q, [pix2q + 2*lsizeq] +%else + add pix1q, lsizeq + add pix2q, lsizeq +%endif + dec hd jnz .next2lines mova m1, m7 +%if mmsize == 8 + psrlq m7, 32 ; shift hi dword to lo +%else psrldq m7, 8 ; shift hi qword to lo paddd m7, m1 mova m1, m7 psrldq m7, 4 ; shift hi dword to lo +%endif paddd m7, m1 movd eax, m7 ; return value RET +%endmacro + +INIT_MMX mmx +SUM_SQUARED_ERRORS 8, 0 + +INIT_MMX mmx +SUM_SQUARED_ERRORS 16, 0 + +INIT_XMM sse2 +SUM_SQUARED_ERRORS 16, 8 INIT_MMX mmx ; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size) diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index e63d510ab9..a7518eb1c8 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -42,138 +42,13 @@ int ff_sum_abs_dctelem_mmx(int16_t *block); int ff_sum_abs_dctelem_mmxext(int16_t *block); int ff_sum_abs_dctelem_sse2(int16_t *block); int ff_sum_abs_dctelem_ssse3(int16_t *block); +int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h); +int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h); #if HAVE_INLINE_ASM -static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx \n" - "shr $1, %%ecx \n" - "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */ - "1: \n" - "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */ - "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */ - "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */ - "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5 \n" - "movq %%mm3, %%mm6 \n" - "psubusb %%mm2, %%mm1 \n" - "psubusb %%mm4, %%mm3 \n" - "psubusb %%mm5, %%mm2 \n" - "psubusb %%mm6, %%mm4 \n" - - "por %%mm1, %%mm2 \n" - "por %%mm3, %%mm4 \n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1 \n" - "movq %%mm4, %%mm3 \n" - - "punpckhbw %%mm0, %%mm2 \n" - "punpckhbw %%mm0, %%mm4 \n" - "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2 \n" - "pmaddwd %%mm4, %%mm4 \n" - "pmaddwd %%mm1, %%mm1 \n" - "pmaddwd %%mm3, %%mm3 \n" - - "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * line_size */ - "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * line_size */ - - "paddd %%mm2, %%mm1 \n" - "paddd %%mm4, %%mm3 \n" - "paddd %%mm1, %%mm7 \n" - "paddd %%mm3, %%mm7 \n" - - "decl %%ecx \n" - "jnz 1b \n" - - "movq %%mm7, %%mm1 \n" - "psrlq $32, %%mm7 \n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1 \n" - "movd %%mm1, %2 \n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} - -static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx\n" - "pxor %%mm0, %%mm0\n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */ - "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */ - "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */ - "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5\n" - "movq %%mm3, %%mm6\n" - "psubusb %%mm2, %%mm1\n" - "psubusb %%mm4, %%mm3\n" - "psubusb %%mm5, %%mm2\n" - "psubusb %%mm6, %%mm4\n" - - "por %%mm1, %%mm2\n" - "por %%mm3, %%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1\n" - "movq %%mm4, %%mm3\n" - - "punpckhbw %%mm0, %%mm2\n" - "punpckhbw %%mm0, %%mm4\n" - "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2\n" - "pmaddwd %%mm4, %%mm4\n" - "pmaddwd %%mm1, %%mm1\n" - "pmaddwd %%mm3, %%mm3\n" - - "add %3, %0\n" - "add %3, %1\n" - - "paddd %%mm2, %%mm1\n" - "paddd %%mm4, %%mm3\n" - "paddd %%mm1, %%mm7\n" - "paddd %%mm3, %%mm7\n" - - "decl %%ecx\n" - "jnz 1b\n" - - "movq %%mm7, %%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1\n" - "movd %%mm1, %2\n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} - static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h) { int tmp; @@ -427,7 +302,7 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, if (c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); else - score1 = sse16_mmx(c, pix1, pix2, line_size, h); + score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h); score2 = hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); @@ -440,7 +315,7 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { - int score1 = sse8_mmx(c, pix1, pix2, line_size, h); + int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h); int score2 = hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); @@ -891,6 +766,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, c->diff_pixels = ff_diff_pixels_mmx; c->pix_sum = ff_pix_sum16_mmx; c->pix_norm1 = ff_pix_norm1_mmx; + c->sse[0] = ff_sse16_mmx; + c->sse[1] = ff_sse8_mmx; } if (EXTERNAL_SSE2(cpu_flags)) @@ -904,8 +781,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, c->fdct = ff_fdct_mmx; c->diff_bytes = diff_bytes_mmx; - c->sse[0] = sse16_mmx; - c->sse[1] = sse8_mmx; c->vsad[4] = vsad_intra16_mmx; c->nsse[0] = nsse16_mmx;