@ -25,37 +25,76 @@
# include "libavutil/x86/asm.h"
/* A single TEMPLATE_PP_* should be defined (to 1) when this template is
 * included. The following macros will define its dependencies to 1 as well
 * (like MMX2 depending on MMX), and will define all the others to 0. Every
 * TEMPLATE_PP_* needs to be undefined at the end. */
/*
 * Variant selection.
 *
 * For each TEMPLATE_PP_* selector: if the includer defined it, RENAME() is
 * set up to paste the matching suffix onto function names; otherwise the
 * selector is defined to 0 so that it can be tested with plain #if below.
 */

/* Plain C variant: names get the _C suffix. */
# ifdef TEMPLATE_PP_C
# define RENAME(a) a ## _C
# else
# define TEMPLATE_PP_C 0
# endif
/* AltiVec variant: names get the _altivec suffix. */
# ifdef TEMPLATE_PP_ALTIVEC
# define RENAME(a) a ## _altivec
# else
# define TEMPLATE_PP_ALTIVEC 0
# endif
/* MMX variant: names get the _MMX suffix. */
# ifdef TEMPLATE_PP_MMX
# define RENAME(a) a ## _MMX
# else
# define TEMPLATE_PP_MMX 0
# endif
/* MMXEXT variant: names get the _MMX2 suffix. TEMPLATE_PP_MMX was set to 0
 * just above (when not selected itself), so it is redefined to 1 here to
 * turn on the MMX code paths as well. */
# ifdef TEMPLATE_PP_MMXEXT
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# define RENAME(a) a ## _MMX2
# else
# define TEMPLATE_PP_MMXEXT 0
# endif
/* 3DNow! variant: names get the _3DNow suffix. As for MMXEXT,
 * TEMPLATE_PP_MMX is forced to 1. */
# ifdef TEMPLATE_PP_3DNOW
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# define RENAME(a) a ## _3DNow
# else
# define TEMPLATE_PP_3DNOW 0
# endif
# undef REAL_PAVGB
# undef PAVGB
# undef PMINUB
# undef PMAXUB
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
# define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
# elif HAVE_AMD3DNOW_INLINE
# elif TEMPLATE_PP_3DNOW
# define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
# endif
# define PAVGB(a,b) REAL_PAVGB(a,b)
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
# define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
# elif HAVE_MMX_INLINE
# elif TEMPLATE_PP_MMX
# define PMINUB(b,a,t) \
" movq " # a " , " # t " \n \t " \
" psubusb " # b " , " # t " \n \t " \
" psubb " # t " , " # a " \n \t "
# endif
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
# define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
# elif HAVE_MMX_INLINE
# elif TEMPLATE_PP_MMX
# define PMAXUB(a,b) \
" psubusb " # a " , " # b " \n \t " \
" paddb " # a " , " # b " \n \t "
# endif
//FIXME? |255-0| = 1 (should not be a problem ...)
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
/**
 * Check if the middle 8x8 block in the given 8x16 block is flat.
 */
@ -135,7 +174,7 @@ static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
" psubusb %%mm3, %%mm4 \n \t "
" \n \t "
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" pxor %%mm7, %%mm7 \n \t "
" psadbw %%mm7, %%mm0 \n \t "
# else
@ -169,16 +208,16 @@ static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
return 2 ;
}
}
# endif //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMX
/**
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
 * using the 9-tap filter (1,1,2,2,4,2,2,1,1)/16.
 */
# if !HAVE _ALTIVEC
# if !TEMPLATE_PP _ALTIVEC
static inline void RENAME ( doVertLowPass ) ( uint8_t * src , int stride , PPContext * c )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src + = stride * 3 ;
__asm__ volatile ( //"movv %0 %1 %2\n\t"
" movq %2, %%mm0 \n \t " // QP,..., QP
@ -305,7 +344,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
: " r " ( src ) , " r " ( ( x86_reg ) stride ) , " m " ( c - > pQPb )
: " % " REG_a , " % " REG_c
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
const int l1 = stride ;
const int l2 = stride + l1 ;
const int l3 = stride + l2 ;
@ -344,9 +383,9 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
src + + ;
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
# endif //HAVE _ALTIVEC
# endif //TEMPLATE_PP _ALTIVEC
/**
* Experimental Filter 1
@ -357,7 +396,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
*/
static inline void RENAME ( vertX1Filter ) ( uint8_t * src , int stride , PPContext * co )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src + = stride * 3 ;
__asm__ volatile (
@ -443,7 +482,7 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
: " r " ( src ) , " r " ( ( x86_reg ) stride ) , " m " ( co - > pQPb )
: " % " REG_a , " % " REG_c
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
const int l1 = stride ;
const int l2 = stride + l1 ;
@ -477,13 +516,13 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
}
src + + ;
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
# if !HAVE _ALTIVEC
# if !TEMPLATE_PP _ALTIVEC
static inline void RENAME ( doVertDefFilter ) ( uint8_t src [ ] , int stride , PPContext * c )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
/*
uint8_t tmp [ 16 ] ;
const int l1 = stride ;
@ -764,7 +803,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
}
}
*/
# elif HAVE_MMX_INLINE
# elif TEMPLATE_PP_MMX
DECLARE_ALIGNED ( 8 , uint64_t , tmp ) [ 4 ] ; // make space for 4 8-byte vars
src + = stride * 4 ;
__asm__ volatile (
@ -872,7 +911,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
" movq (%3), %%mm2 \n \t " // 2L0 - 5L1 + 5L2 - 2L3
" movq 8(%3), %%mm3 \n \t " // 2H0 - 5H1 + 5H2 - 2H3
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" movq %%mm7, %%mm6 \n \t " // 0
" psubw %%mm0, %%mm6 \n \t "
" pmaxsw %%mm6, %%mm0 \n \t " // |2L4 - 5L5 + 5L6 - 2L7|
@ -904,7 +943,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
" psubw %%mm6, %%mm3 \n \t " // |2H0 - 5H1 + 5H2 - 2H3|
# endif
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" pminsw %%mm2, %%mm0 \n \t "
" pminsw %%mm3, %%mm1 \n \t "
# else
@ -968,7 +1007,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
" pand %%mm2, %%mm4 \n \t "
" pand %%mm3, %%mm5 \n \t "
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" pminsw %%mm0, %%mm4 \n \t "
" pminsw %%mm1, %%mm5 \n \t "
# else
@ -995,7 +1034,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
: " r " ( ( x86_reg ) stride ) , " m " ( c - > pQPb ) , " r " ( tmp )
: " % " REG_a
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
const int l1 = stride ;
const int l2 = stride + l1 ;
const int l3 = stride + l2 ;
@ -1033,14 +1072,14 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
}
src + + ;
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
# endif //HAVE _ALTIVEC
# endif //TEMPLATE_PP _ALTIVEC
# if !HAVE _ALTIVEC
# if !TEMPLATE_PP _ALTIVEC
static inline void RENAME ( dering ) ( uint8_t src [ ] , int stride , PPContext * c )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
DECLARE_ALIGNED ( 8 , uint64_t , tmp ) [ 3 ] ;
__asm__ volatile (
" pxor %%mm6, %%mm6 \n \t "
@ -1060,7 +1099,7 @@ static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
# undef REAL_FIND_MIN_MAX
# undef FIND_MIN_MAX
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
# define REAL_FIND_MIN_MAX(addr)\
" movq " # addr " , %%mm0 \n \t " \
" pminub %%mm0, %%mm7 \n \t " \
@ -1087,7 +1126,7 @@ FIND_MIN_MAX((%0, %1, 8))
" movq %%mm7, %%mm4 \n \t "
" psrlq $8, %%mm7 \n \t "
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" pminub %%mm4, %%mm7 \n \t " // min of pixels
" pshufw $0xF9, %%mm7, %%mm4 \n \t "
" pminub %%mm4, %%mm7 \n \t " // min of pixels
@ -1112,7 +1151,7 @@ FIND_MIN_MAX((%0, %1, 8))
" movq %%mm6, %%mm4 \n \t "
" psrlq $8, %%mm6 \n \t "
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" pmaxub %%mm4, %%mm6 \n \t " // max of pixels
" pshufw $0xF9, %%mm6, %%mm4 \n \t "
" pmaxub %%mm4, %%mm6 \n \t "
@ -1266,7 +1305,7 @@ DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,
: : " r " ( src ) , " r " ( ( x86_reg ) stride ) , " m " ( c - > pQPb ) , " m " ( c - > pQPb2 ) , " q " ( tmp )
: " % " REG_a , " % " REG_d
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
int y ;
int min = 255 ;
int max = 0 ;
@ -1383,9 +1422,9 @@ DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,
// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
}
# endif
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
# endif //HAVE _ALTIVEC
# endif //TEMPLATE_PP _ALTIVEC
/**
* Deinterlace the given block by linearly interpolating every second line .
@ -1395,7 +1434,7 @@ DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,
*/
static inline void RENAME ( deInterlaceInterpolateLinear ) ( uint8_t src [ ] , int stride )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src + = 4 * stride ;
__asm__ volatile (
" lea (%0, %1), %% " REG_a " \n \t "
@ -1448,7 +1487,7 @@ static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int strid
*/
static inline void RENAME ( deInterlaceInterpolateCubic ) ( uint8_t src [ ] , int stride )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src + = stride * 3 ;
__asm__ volatile (
" lea (%0, %1), %% " REG_a " \n \t "
@ -1490,7 +1529,7 @@ DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc,
: : " r " ( src ) , " r " ( ( x86_reg ) stride )
: " % " REG_a , " % " REG_d , " % " REG_c
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
int x ;
src + = stride * 3 ;
for ( x = 0 ; x < 8 ; x + + ) {
@ -1500,7 +1539,7 @@ DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc,
src [ stride * 9 ] = CLIP ( ( - src [ stride * 6 ] + 9 * src [ stride * 8 ] + 9 * src [ stride * 10 ] - src [ stride * 12 ] ) > > 4 ) ;
src + + ;
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
/**
@ -1512,7 +1551,7 @@ DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc,
*/
static inline void RENAME ( deInterlaceFF ) ( uint8_t src [ ] , int stride , uint8_t * tmp )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src + = stride * 4 ;
__asm__ volatile (
" lea (%0, %1), %% " REG_a " \n \t "
@ -1561,7 +1600,7 @@ DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
: : " r " ( src ) , " r " ( ( x86_reg ) stride ) , " r " ( tmp )
: " % " REG_a , " % " REG_d
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
int x ;
src + = stride * 4 ;
for ( x = 0 ; x < 8 ; x + + ) {
@ -1579,7 +1618,7 @@ DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
src + + ;
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
/**
@ -1591,7 +1630,7 @@ DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
*/
static inline void RENAME ( deInterlaceL5 ) ( uint8_t src [ ] , int stride , uint8_t * tmp , uint8_t * tmp2 )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src + = stride * 4 ;
__asm__ volatile (
" lea (%0, %1), %% " REG_a " \n \t "
@ -1651,7 +1690,7 @@ DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
: : " r " ( src ) , " r " ( ( x86_reg ) stride ) , " r " ( tmp ) , " r " ( tmp2 )
: " % " REG_a , " % " REG_d
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
int x ;
src + = stride * 4 ;
for ( x = 0 ; x < 8 ; x + + ) {
@ -1680,7 +1719,7 @@ DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
src + + ;
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
/**
@ -1692,7 +1731,7 @@ DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
*/
static inline void RENAME ( deInterlaceBlendLinear ) ( uint8_t src [ ] , int stride , uint8_t * tmp )
{
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src + = 4 * stride ;
__asm__ volatile (
" lea (%0, %1), %% " REG_a " \n \t "
@ -1739,7 +1778,7 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
: : " r " ( src ) , " r " ( ( x86_reg ) stride ) , " r " ( tmp )
: " % " REG_a , " % " REG_d
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
int a , b , c , x ;
src + = 4 * stride ;
@ -1782,7 +1821,7 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
src + = 4 ;
tmp + = 4 ;
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
/**
@ -1793,9 +1832,9 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
*/
static inline void RENAME ( deInterlaceMedian ) ( uint8_t src [ ] , int stride )
{
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
src + = 4 * stride ;
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
__asm__ volatile (
" lea (%0, %1), %% " REG_a " \n \t "
" lea (%% " REG_a " , %1, 4), %% " REG_d " \n \t "
@ -1885,8 +1924,8 @@ MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
: : " r " ( src ) , " r " ( ( x86_reg ) stride )
: " % " REG_a , " % " REG_d
) ;
# endif //HAVE_MMXEXT_INLINE
# else //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMXEXT
# else //TEMPLATE_PP_MMX
int x , y ;
src + = 4 * stride ;
// FIXME - there should be a way to do a few columns in parallel like w/mmx
@ -1905,10 +1944,10 @@ MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
}
src + + ;
}
# endif //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMX
}
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
/**
 * Transpose and shift the given 8x8 block into dst1 and dst2.
 */
@ -2073,10 +2112,10 @@ static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
: " % " REG_a , " % " REG_d
) ;
}
# endif //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMX
//static long test=0;
# if !HAVE _ALTIVEC
# if !TEMPLATE_PP _ALTIVEC
static inline void RENAME ( tempNoiseReducer ) ( uint8_t * src , int stride ,
uint8_t * tempBlurred , uint32_t * tempBlurredPast , int * maxNoise )
{
@ -2087,7 +2126,7 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
# define FAST_L2_DIFF
//#define L1_DIFF //you should change the thresholds too if you try that one
# if HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
__asm__ volatile (
" lea (%2, %2, 2), %% " REG_a " \n \t " // 3*stride
" lea (%2, %2, 4), %% " REG_d " \n \t " // 5*stride
@ -2375,7 +2414,7 @@ L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
: : " r " ( src ) , " r " ( tempBlurred ) , " r " ( ( x86_reg ) stride ) , " m " ( tempBlurredPast )
: " % " REG_a , " % " REG_d , " % " REG_c , " memory "
) ;
# else //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
{
int y ;
int d = 0 ;
@ -2458,11 +2497,11 @@ Switch between
}
}
}
# endif //HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE
# endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
}
# endif //HAVE _ALTIVEC
# endif //TEMPLATE_PP _ALTIVEC
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
/**
* accurate deblock filter
*/
@ -2865,7 +2904,7 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st
" movq (%4), %%mm2 \n \t " // 2L0 - 5L1 + 5L2 - 2L3
" movq 8(%4), %%mm3 \n \t " // 2H0 - 5H1 + 5H2 - 2H3
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" movq %%mm7, %%mm6 \n \t " // 0
" psubw %%mm0, %%mm6 \n \t "
" pmaxsw %%mm6, %%mm0 \n \t " // |2L4 - 5L5 + 5L6 - 2L7|
@ -2897,7 +2936,7 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st
" psubw %%mm6, %%mm3 \n \t " // |2H0 - 5H1 + 5H2 - 2H3|
# endif
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" pminsw %%mm2, %%mm0 \n \t "
" pminsw %%mm3, %%mm1 \n \t "
# else
@ -2961,7 +3000,7 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st
" pand %%mm2, %%mm4 \n \t "
" pand %%mm3, %%mm5 \n \t "
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
" pminsw %%mm0, %%mm4 \n \t "
" pminsw %%mm1, %%mm5 \n \t "
# else
@ -2998,7 +3037,7 @@ static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int st
}
} */
}
# endif //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMX
static void RENAME ( postProcess ) ( const uint8_t src [ ] , int srcStride , uint8_t dst [ ] , int dstStride , int width , int height ,
const QP_STORE_T QPs [ ] , int QPStride , int isColor , PPContext * c ) ;
@ -3013,18 +3052,18 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
static inline void RENAME ( blockCopy ) ( uint8_t dst [ ] , int dstStride , const uint8_t src [ ] , int srcStride ,
int levelFix , int64_t * packedOffsetAndScale )
{
# if !HAVE_MMX_INLINE
# if !TEMPLATE_PP_MMX
int i ;
# endif
if ( levelFix ) {
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
__asm__ volatile (
" movq (%% " REG_a " ), %%mm2 \n \t " // packedYOffset
" movq 8(%% " REG_a " ), %%mm3 \n \t " // packedYScale
" lea (%2,%4), %% " REG_a " \n \t "
" lea (%3,%5), %% " REG_d " \n \t "
" pxor %%mm4, %%mm4 \n \t "
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
# define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
" movq " # src1 " , %%mm0 \n \t " \
" movq " # src1 " , %%mm5 \n \t " \
@ -3047,7 +3086,7 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t
" movq %%mm0, " # dst1 " \n \t " \
" movq %%mm1, " # dst2 " \n \t " \
# else //HAVE_MMXEXT_INLINE
# else //TEMPLATE_PP_MMXEXT
# define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
" movq " # src1 " , %%mm0 \n \t " \
" movq " # src1 " , %%mm5 \n \t " \
@ -3074,7 +3113,7 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t
" movq %%mm0, " # dst1 " \n \t " \
" movq %%mm1, " # dst2 " \n \t " \
# endif //HAVE_MMXEXT_INLINE
# endif //TEMPLATE_PP_MMXEXT
# define SCALED_CPY(src1, src2, dst1, dst2)\
REAL_SCALED_CPY ( src1 , src2 , dst1 , dst2 )
@ -3094,13 +3133,13 @@ SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
" r " ( ( x86_reg ) dstStride )
: " % " REG_d
) ;
# else //HAVE_MMX_INLINE
# else //TEMPLATE_PP_MMX
for ( i = 0 ; i < 8 ; i + + )
memcpy ( & ( dst [ dstStride * i ] ) ,
& ( src [ srcStride * i ] ) , BLOCK_SIZE ) ;
# endif //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMX
} else {
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
__asm__ volatile (
" lea (%0,%2), %% " REG_a " \n \t "
" lea (%1,%3), %% " REG_d " \n \t "
@ -3127,11 +3166,11 @@ SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
" r " ( ( x86_reg ) dstStride )
: " % " REG_a , " % " REG_d
) ;
# else //HAVE_MMX_INLINE
# else //TEMPLATE_PP_MMX
for ( i = 0 ; i < 8 ; i + + )
memcpy ( & ( dst [ dstStride * i ] ) ,
& ( src [ srcStride * i ] ) , BLOCK_SIZE ) ;
# endif //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMX
}
}
@ -3140,7 +3179,7 @@ SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
*/
static inline void RENAME ( duplicate ) ( uint8_t src [ ] , int stride )
{
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
__asm__ volatile (
" movq (%0), %%mm0 \n \t "
" add %1, %0 \n \t "
@ -3168,8 +3207,8 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
{
DECLARE_ALIGNED ( 8 , PPContext , c ) = * c2 ; //copy to stack for faster access
int x , y ;
# ifdef COMPILE _TIME_MODE
const int mode = COMPILE _TIME_MODE;
# ifdef TEMPLATE_PP _TIME_MODE
const int mode = TEMPLATE_PP _TIME_MODE;
# else
const int mode = isColor ? c . ppMode . chromMode : c . ppMode . lumMode ;
# endif
@ -3177,7 +3216,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
int QPCorrecture = 256 * 256 ;
int copyAhead ;
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
int i ;
# endif
@ -3190,7 +3229,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
uint8_t * const tempDst = dstStride > 0 ? c . tempDst : c . tempDst - 23 * dstStride ;
//const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
for ( i = 0 ; i < 57 ; i + + ) {
int offset = ( ( i * c . ppMode . baseDcDiff ) > > 8 ) + 1 ;
int threshold = offset * 2 + 1 ;
@ -3248,7 +3287,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
scale = ( double ) ( c . ppMode . maxAllowedY - c . ppMode . minAllowedY ) / ( double ) ( white - black ) ;
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
c . packedYScale = ( uint16_t ) ( scale * 256.0 + 0.5 ) ;
c . packedYOffset = ( ( ( black * c . packedYScale ) > > 8 ) - c . ppMode . minAllowedY ) & 0xFFFF ;
# else
@ -3281,7 +3320,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for ( x = 0 ; x < width ; x + = BLOCK_SIZE ) {
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
/*
prefetchnta ( srcBlock + ( ( ( x > > 2 ) & 6 ) + 5 ) * srcStride + 32 ) ;
prefetchnta ( srcBlock + ( ( ( x > > 2 ) & 6 ) + 6 ) * srcStride + 32 ) ;
@ -3308,7 +3347,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
: " % " REG_a , " % " REG_d
) ;
# elif HAVE_AMD3DNOW_INLINE
# elif TEMPLATE_PP_3DNOW
//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch ( srcBlock + ( ( ( x > > 3 ) & 3 ) + 9 ) * srcStride + 32 ) ;
@ -3354,7 +3393,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
//1% speedup if these are here instead of the inner loop
const uint8_t * srcBlock = & ( src [ y * srcStride ] ) ;
uint8_t * dstBlock = & ( dst [ y * dstStride ] ) ;
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
uint8_t * tempBlock1 = c . tempBlocks ;
uint8_t * tempBlock2 = c . tempBlocks + 8 ;
# endif
@ -3390,7 +3429,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for ( x = 0 ; x < width ; x + = BLOCK_SIZE ) {
const int stride = dstStride ;
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
uint8_t * tmpXchg ;
# endif
if ( isColor ) {
@ -3404,7 +3443,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
yHistogram [ srcBlock [ srcStride * 12 + 4 ] ] + + ;
}
c . QP = QP ;
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
__asm__ volatile (
" movd %1, %%mm7 \n \t "
" packuswb %%mm7, %%mm7 \n \t " // 0, 0, 0, QP, 0, 0, 0, QP
@ -3417,7 +3456,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
# endif
# if HAVE_MMXEXT_INLINE
# if TEMPLATE_PP_MMXEXT
/*
prefetchnta ( srcBlock + ( ( ( x > > 2 ) & 6 ) + 5 ) * srcStride + 32 ) ;
prefetchnta ( srcBlock + ( ( ( x > > 2 ) & 6 ) + 6 ) * srcStride + 32 ) ;
@ -3444,7 +3483,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
: " % " REG_a , " % " REG_d
) ;
# elif HAVE_AMD3DNOW_INLINE
# elif TEMPLATE_PP_3DNOW
//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch ( srcBlock + ( ( ( x > > 3 ) & 3 ) + 9 ) * srcStride + 32 ) ;
@ -3488,12 +3527,12 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
}
}
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
RENAME ( transpose1 ) ( tempBlock1 , tempBlock2 , dstBlock , dstStride ) ;
# endif
/* check if we have a previous block to deblock it with dstBlock */
if ( x - 8 > = 0 ) {
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
if ( mode & H_X1_FILTER )
RENAME ( vertX1Filter ) ( tempBlock1 , 16 , & c ) ;
else if ( mode & H_DEBLOCK ) {
@ -3514,7 +3553,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
if ( mode & H_X1_FILTER )
horizX1Filter ( dstBlock - 4 , stride , QP ) ;
else if ( mode & H_DEBLOCK ) {
# if HAVE _ALTIVEC
# if TEMPLATE_PP _ALTIVEC
DECLARE_ALIGNED ( 16 , unsigned char , tempBlock ) [ 272 ] ;
int t ;
transpose_16x8_char_toPackedAlign_altivec ( tempBlock , dstBlock - ( 4 + 1 ) , stride ) ;
@ -3539,7 +3578,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
} else if ( mode & H_A_DEBLOCK ) {
RENAME ( do_a_deblock ) ( dstBlock - 8 , 1 , stride , & c ) ;
}
# endif //HAVE_MMX_INLINE
# endif //TEMPLATE_PP_MMX
if ( mode & DERING ) {
//FIXME filter first line
if ( y > 0 ) RENAME ( dering ) ( dstBlock - stride - 8 , stride , & c ) ;
@ -3557,7 +3596,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
dstBlock + = 8 ;
srcBlock + = 8 ;
# if HAVE_MMX_INLINE
# if TEMPLATE_PP_MMX
tmpXchg = tempBlock1 ;
tempBlock1 = tempBlock2 ;
tempBlock2 = tmpXchg ;
@ -3597,9 +3636,9 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
+ dstBlock [ x + 14 * dstStride ] + dstBlock [ x + 15 * dstStride ] ;
} */
}
# if HAVE_AMD3DNOW_INLINE
# if TEMPLATE_PP_3DNOW
__asm__ volatile ( " femms " ) ;
# elif HAVE_MMX_INLINE
# elif TEMPLATE_PP_MMX
__asm__ volatile ( " emms " ) ;
# endif
@ -3629,3 +3668,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
* c2 = c ; //copy local context back
}
/* End of template: drop RENAME and every TEMPLATE_PP_* selector, as the note
 * at the top of this file requires. Presumably this lets the includer pick a
 * different variant before including the template again -- confirm against
 * the including file. */
# undef RENAME
# undef TEMPLATE_PP_C
# undef TEMPLATE_PP_ALTIVEC
# undef TEMPLATE_PP_MMX
# undef TEMPLATE_PP_MMXEXT
# undef TEMPLATE_PP_3DNOW