diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 7426c01dbb..ba8a1773c2 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -419,6 +419,31 @@ cglobal diff_pixels, 4,5
     jne .loop
     REP_RET
 
+INIT_XMM sse2                   ; SSE2 variant of diff_pixels: block[i] = s1[i] - s2[i] as 16-bit words
+cglobal diff_pixels, 4, 5, 5    ; r0 = block, r1 = s1, r2 = s2, r3 = stride; r4 is scratch; uses m0-m4
+    movsxdifnidn r3, r3d        ; stride is a C int: sign-extend it to full register width where needed
+    pxor m4, m4                 ; m4 = 0, supplies the zero high bytes for the unpacks below
+    add r0, 128                 ; point r0 at the end of the 128-byte output block
+    mov r4, -128                ; negative byte offset from that end; counts up to zero
+.loop:                          ; each iteration handles two rows of 8 pixels
+    movh m0, [r1]               ; 8 bytes of s1, first row
+    movh m2, [r2]               ; 8 bytes of s2, first row
+    movh m1, [r1+r3]            ; 8 bytes of s1, second row
+    movh m3, [r2+r3]            ; 8 bytes of s2, second row
+    punpcklbw m0, m4            ; zero-extend the bytes to 16-bit words
+    punpcklbw m1, m4
+    punpcklbw m2, m4
+    punpcklbw m3, m4
+    psubw m0, m2                ; first row:  s1 - s2
+    psubw m1, m3                ; second row: s1 - s2
+    mova [r0+r4+0 ], m0         ; aligned 16-byte store of the first row (8 words)
+    mova [r0+r4+16], m1         ; aligned 16-byte store of the second row
+    lea r1, [r1+r3*2]           ; advance s1 by two rows
+    lea r2, [r2+r3*2]           ; advance s2 by two rows
+    add r4, 32                  ; 32 output bytes written; ZF is set once the offset reaches 0
+    jne .loop                   ; 128 / 32 = 4 iterations, i.e. 8 rows in total
+    RET
+
 INIT_MMX mmx
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
 cglobal pix_sum16, 2, 3
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index e63d510ab9..acff94702f 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -36,6 +36,8 @@ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                         int stride); /* SSE2 counterpart of ff_diff_pixels_mmx, same signature */
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 int ff_sum_abs_dctelem_mmx(int16_t *block);
@@ -971,6 +973,7 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->sse[0] = ff_sse16_sse2;
         c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
+        c->diff_pixels = ff_diff_pixels_sse2; /* install the SSE2 diff_pixels when EXTERNAL_SSE2(cpu_flags) holds */
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;