From aeae5d537b35356a783e156fb218eb161d7eb93e Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Tue, 18 Feb 2003 19:22:34 +0000
Subject: [PATCH] optimize

Originally committed as revision 9455 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
---
 postproc/rgb2rgb.c          |  5 +++
 postproc/rgb2rgb_template.c | 74 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/postproc/rgb2rgb.c b/postproc/rgb2rgb.c
index be21af0828..c07301c3f3 100644
--- a/postproc/rgb2rgb.c
+++ b/postproc/rgb2rgb.c
@@ -28,6 +28,11 @@ static const uint64_t mask32b  __attribute__((aligned(8))) = 0x000000FF000000FFU
 static const uint64_t mask32g  __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
 static const uint64_t mask32r  __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
 static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
+static const uint64_t mask3216br __attribute__((aligned(8)))=0x00F800F800F800F8ULL; /* per 32-bit pixel: top 5 bits of bytes 0 and 2; used by both rgb32to16 and rgb32to15 */
+static const uint64_t mask3216g  __attribute__((aligned(8)))=0x0000FC000000FC00ULL; /* per 32-bit pixel: top 6 bits of byte 1 (rgb32to16) */
+static const uint64_t mask3215g  __attribute__((aligned(8)))=0x0000F8000000F800ULL; /* per 32-bit pixel: top 5 bits of byte 1 (rgb32to15) */
+static const uint64_t mul3216  __attribute__((aligned(8))) = 0x2000000420000004ULL; /* pmaddwd factors per pixel: low word * 4, high word * 0x2000 (rgb32to16) */
+static const uint64_t mul3215  __attribute__((aligned(8))) = 0x2000000820000008ULL; /* pmaddwd factors per pixel: low word * 8, high word * 0x2000 (rgb32to15) */
 static const uint64_t mask24b  __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL;
 static const uint64_t mask24g  __attribute__((aligned(8))) = 0xFF0000FF0000FF00ULL;
 static const uint64_t mask24r  __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL;
diff --git a/postproc/rgb2rgb_template.c b/postproc/rgb2rgb_template.c
index e299b0c12e..01ba6ed6f2 100644
--- a/postproc/rgb2rgb_template.c
+++ b/postproc/rgb2rgb_template.c
@@ -318,12 +318,46 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
 #ifdef HAVE_MMX
+	mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
+	if(s < mm_end) asm volatile( /* guard: the loop only tests at the bottom, so it must not be entered when fewer than 16 source bytes remain */
+		"movq %3, %%mm5			\n\t" /* mm5 = mask3216g */
+		"movq %4, %%mm6			\n\t" /* mm6 = mask3216br */
+		"movq %5, %%mm7			\n\t" /* mm7 = mul3216 */
+		".balign 16			\n\t"
+		"1:				\n\t"
+		PREFETCH" 32(%1)		\n\t"
+		"movd	(%1), %%mm0		\n\t" /* mm0 = pixel 0 */
+		"movd	4(%1), %%mm3		\n\t" /* mm3 = pixel 1 */
+		"punpckldq 8(%1), %%mm0		\n\t" /* mm0 = pixels 0 and 2 */
+		"punpckldq 12(%1), %%mm3	\n\t" /* mm3 = pixels 1 and 3 */
+		"movq %%mm0, %%mm1		\n\t" /* copies kept for the byte-1 field */
+		"movq %%mm3, %%mm4		\n\t"
+		"pand %%mm6, %%mm0		\n\t" /* keep top 5 bits of bytes 0 and 2 */
+		"pand %%mm6, %%mm3		\n\t"
+		"pmaddwd %%mm7, %%mm0		\n\t" /* low word*4 + high word*0x2000: byte-0 field to bits 5-9, byte-2 field to bits 16-20 */
+		"pmaddwd %%mm7, %%mm3		\n\t"
+		"pand %%mm5, %%mm1		\n\t" /* keep top 6 bits of byte 1 (bits 10-15) */
+		"pand %%mm5, %%mm4		\n\t"
+		"por %%mm1, %%mm0		\n\t" /* all three fields now adjacent in bits 5-20 */
+		"por %%mm4, %%mm3		\n\t"
+		"psrld $5, %%mm0		\n\t" /* pixels 0,2: 16-bit result in the low word of each dword */
+		"pslld $11, %%mm3		\n\t" /* pixels 1,3: 16-bit result in the high word of each dword */
+		"por %%mm3, %%mm0		\n\t" /* interleave: mm0 = output pixels 0,1,2,3 */
+		MOVNTQ"	%%mm0, (%0)		\n\t" /* store 4 output pixels (8 bytes) */
+		"addl $16, %1			\n\t" /* advance source by 4 input pixels */
+		"addl $8, %0			\n\t" /* advance destination by 4 output pixels */
+		"cmpl %2, %1			\n\t"
+		" jb 1b				\n\t" /* repeat while s < mm_end (unsigned compare) */
+		: "+r" (d), "+r"(s)
+		: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+	);
+#else
 	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
 	__asm __volatile(
 	    "movq	%0, %%mm7\n\t"
 	    "movq	%1, %%mm6\n\t"
 	    ::"m"(red_16mask),"m"(green_16mask));
-	mm_end = end - 15;
 	while(s < mm_end)
 	{
 	    __asm __volatile(
@@ -359,6 +393,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
 		d += 4;
 		s += 16;
 	}
+#endif
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif
@@ -441,12 +476,46 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
 	uint16_t *d = (uint16_t *)dst;
 	end = s + src_size;
 #ifdef HAVE_MMX
+	mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
+	if(s < mm_end) asm volatile( /* guard: the loop only tests at the bottom, so it must not be entered when fewer than 16 source bytes remain */
+		"movq %3, %%mm5			\n\t" /* mm5 = mask3215g */
+		"movq %4, %%mm6			\n\t" /* mm6 = mask3216br (same byte-0/byte-2 mask as the 16-bit variant) */
+		"movq %5, %%mm7			\n\t" /* mm7 = mul3215 */
+		".balign 16			\n\t"
+		"1:				\n\t"
+		PREFETCH" 32(%1)		\n\t"
+		"movd	(%1), %%mm0		\n\t" /* mm0 = pixel 0 */
+		"movd	4(%1), %%mm3		\n\t" /* mm3 = pixel 1 */
+		"punpckldq 8(%1), %%mm0		\n\t" /* mm0 = pixels 0 and 2 */
+		"punpckldq 12(%1), %%mm3	\n\t" /* mm3 = pixels 1 and 3 */
+		"movq %%mm0, %%mm1		\n\t" /* copies kept for the byte-1 field */
+		"movq %%mm3, %%mm4		\n\t"
+		"pand %%mm6, %%mm0		\n\t" /* keep top 5 bits of bytes 0 and 2 */
+		"pand %%mm6, %%mm3		\n\t"
+		"pmaddwd %%mm7, %%mm0		\n\t" /* low word*8 + high word*0x2000: byte-0 field to bits 6-10, byte-2 field to bits 16-20 */
+		"pmaddwd %%mm7, %%mm3		\n\t"
+		"pand %%mm5, %%mm1		\n\t" /* keep top 5 bits of byte 1 (bits 11-15) */
+		"pand %%mm5, %%mm4		\n\t"
+		"por %%mm1, %%mm0		\n\t" /* all three fields now adjacent in bits 6-20 */
+		"por %%mm4, %%mm3		\n\t"
+		"psrld $6, %%mm0		\n\t" /* pixels 0,2: 15-bit result in the low word of each dword */
+		"pslld $10, %%mm3		\n\t" /* pixels 1,3: 15-bit result in the high word of each dword */
+		"por %%mm3, %%mm0		\n\t" /* interleave: mm0 = output pixels 0,1,2,3 */
+		MOVNTQ"	%%mm0, (%0)		\n\t" /* store 4 output pixels (8 bytes) */
+		"addl $16, %1			\n\t" /* advance source by 4 input pixels */
+		"addl $8, %0			\n\t" /* advance destination by 4 output pixels */
+		"cmpl %2, %1			\n\t"
+		" jb 1b				\n\t" /* repeat while s < mm_end (unsigned compare) */
+		: "+r" (d), "+r"(s)
+		: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+	);
+#else
 	__asm __volatile(PREFETCH"	%0"::"m"(*src):"memory");
 	__asm __volatile(
 	    "movq	%0, %%mm7\n\t"
 	    "movq	%1, %%mm6\n\t"
 	    ::"m"(red_15mask),"m"(green_15mask));
-	mm_end = end - 15;
 	while(s < mm_end)
 	{
 	    __asm __volatile(
@@ -482,6 +551,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
 		d += 4;
 		s += 16;
 	}
+#endif
 	__asm __volatile(SFENCE:::"memory");
 	__asm __volatile(EMMS:::"memory");
 #endif