diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c index 641965efe0..3746b9b2f0 100644 --- a/libswscale/rgb2rgb.c +++ b/libswscale/rgb2rgb.c @@ -99,6 +99,7 @@ void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t #if ARCH_X86 +DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL; DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL; DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL; DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL; diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c index 663514a76e..9c78ff6e51 100644 --- a/libswscale/rgb2rgb_template.c +++ b/libswscale/rgb2rgb_template.c @@ -1773,13 +1773,22 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi const x86_reg mmxSize= srcWidth&~15; __asm__ volatile( "mov %4, %%"REG_a" \n\t" + "movq "MANGLE(mmx_ff)", %%mm0 \n\t" + "movq (%0, %%"REG_a"), %%mm4 \n\t" + "movq %%mm4, %%mm2 \n\t" + "psllq $8, %%mm4 \n\t" + "pand %%mm0, %%mm2 \n\t" + "por %%mm2, %%mm4 \n\t" + "movq (%1, %%"REG_a"), %%mm5 \n\t" + "movq %%mm5, %%mm3 \n\t" + "psllq $8, %%mm5 \n\t" + "pand %%mm0, %%mm3 \n\t" + "por %%mm3, %%mm5 \n\t" "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm1 \n\t" "movq 1(%0, %%"REG_a"), %%mm2 \n\t" "movq 1(%1, %%"REG_a"), %%mm3 \n\t" - "movq -1(%0, %%"REG_a"), %%mm4 \n\t" - "movq -1(%1, %%"REG_a"), %%mm5 \n\t" PAVGB" %%mm0, %%mm5 \n\t" PAVGB" %%mm0, %%mm3 \n\t" PAVGB" %%mm0, %%mm5 \n\t" @@ -1806,6 +1815,8 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t" #endif "add $8, %%"REG_a" \n\t" + "movq -1(%0, %%"REG_a"), %%mm4 \n\t" + "movq -1(%1, %%"REG_a"), %%mm5 \n\t" " js 1b \n\t" :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), @@ -1815,9 +1826,9 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi ); #else const x86_reg mmxSize=1; -#endif dst[0 ]= (3*src[0] + src[srcStride])>>2; dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; +#endif for (x=mmxSize-1; x>2;