@ -5,6 +5,11 @@
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by michael are under GNU GPL
/* TODO
Move static / global vars into a struct so multiple scalers can be used
write vertical cubic upscale / linear downscale stuff
*/
# undef MOVNTQ
# undef PAVGB
# undef PREFETCH
@ -1154,10 +1159,176 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *buf1, uint16_t *uv
# endif
}
static inline void RENAME ( hyscale ) ( uint16_t * dst , int dstWidth , uint8_t * src , int srcW , int xInc )
/*
 * hScale: horizontal FIR scaler (used for bilinear / bicubic scaling).
 *
 * For every output sample i in [0, dstW) the C reference path computes
 *     sum_{j < filterSize} src[filterPos[i] + j] * filter[filterSize * i + j]
 * shifts the sum right by 7 and clamps it to [0, 32767] before storing it
 * in dst[i].
 *
 * dst        - output line, one 16-bit value per output sample
 * dstW       - number of output samples
 * src        - 8-bit input line
 * srcW, xInc - not referenced anywhere in this function's body
 * filter     - filterSize coefficients per output sample, stored row after row
 * filterPos  - index of the first source sample used for each output sample
 * filterSize - number of taps per output sample
 *
 * With HAVE_MMX there are three inline-asm variants: filterSize == 4,
 * filterSize == 8 and a generic loop that consumes four taps per inner
 * iteration. All three produce two output samples per outer iteration.
 *
 * NOTE(review): because two samples are produced per iteration and the loops
 * end on the carry of "addl $4", an odd dstW appears to process one element
 * past dstW (filterPos, filter and dst) - confirm the buffers are padded.
 * NOTE(review): the asm paths saturate via packssdw but show no clamp to >= 0
 * like the C path has - confirm whether that difference matters.
 * NOTE(review): w02 is defined outside this view; presumably it holds the
 * word constant 2 in every lane so that (sum >> 8) * 2 matches the C path's
 * >> 7 - verify against its definition.
 */
static inline void RENAME ( hScale ) ( int16_t * dst , int dstW , uint8_t * src , int srcW , int xInc ,
int16_t * filter , int16_t * filterPos , int filterSize )
{
# ifdef HAVE_MMX
if ( filterSize = = 4 ) // always true for upscaling, sometimes for down too
{
// counter is a negative byte offset (2 bytes per output sample) that counts up towards 0.
int counter = - 2 * dstW ;
// Advance the three pointers to the end of their arrays so the negative
// offset addresses them from the start: 4*dstW coefficients, dstW positions,
// dstW output samples.
filter - = counter * 2 ;
filterPos - = counter / 2 ;
dst - = counter / 2 ;
// Operands: %0 = counter (eax), %1 = filter (ecx), %2 = filterPos (edx),
// %3 = src (esi), %4 = dst (edi). ebp is saved on the stack and used as the
// running offset; ebx holds the second source position.
asm volatile (
" pxor %%mm7, %%mm7 \n \t "
" movq w02, %%mm6 \n \t "
" pushl %%ebp \n \t " // we use 7 regs here ...
" movl %%eax, %%ebp \n \t "
" .balign 16 \n \t "
" 1: \n \t "
// Load the source positions of two consecutive output samples.
" movzwl (%2, %%ebp), %%eax \n \t "
" movzwl 2(%2, %%ebp), %%ebx \n \t "
// Load the 4 coefficients of each of the two output samples.
" movq (%1, %%ebp, 4), %%mm1 \n \t "
" movq 8(%1, %%ebp, 4), %%mm3 \n \t "
// Load 4 source bytes per sample and widen them to 16 bit (mm7 is zero).
" movd (%3, %%eax), %%mm0 \n \t "
" movd (%3, %%ebx), %%mm2 \n \t "
" punpcklbw %%mm7, %%mm0 \n \t "
" punpcklbw %%mm7, %%mm2 \n \t "
// Multiply-accumulate: each register now holds two 32-bit partial sums.
" pmaddwd %%mm1, %%mm0 \n \t "
" pmaddwd %%mm2, %%mm3 \n \t "
" psrad $8, %%mm0 \n \t "
" psrad $8, %%mm3 \n \t "
" packssdw %%mm3, %%mm0 \n \t "
// Add the two partial sums of each sample (scaled by the words in w02).
" pmaddwd %%mm6, %%mm0 \n \t "
" packssdw %%mm0, %%mm0 \n \t "
// Store two 16-bit results.
" movd %%mm0, (%4, %%ebp) \n \t "
" addl $4, %%ebp \n \t "
// Loop until the negative offset wraps past zero (carry set).
" jnc 1b \n \t "
" popl %%ebp \n \t "
: " +a " ( counter )
: " c " ( filter ) , " d " ( filterPos ) , " S " ( src ) , " D " ( dst )
: " %ebx "
) ;
}
else if ( filterSize = = 8 )
{
// Same scheme as the 4-tap variant, but with 8 coefficients per output
// sample, so filter is advanced by 8*dstW coefficients.
int counter = - 2 * dstW ;
filter - = counter * 4 ;
filterPos - = counter / 2 ;
dst - = counter / 2 ;
asm volatile (
" pxor %%mm7, %%mm7 \n \t "
" movq w02, %%mm6 \n \t "
" pushl %%ebp \n \t " // we use 7 regs here ...
" movl %%eax, %%ebp \n \t "
" .balign 16 \n \t "
" 1: \n \t "
" movzwl (%2, %%ebp), %%eax \n \t "
" movzwl 2(%2, %%ebp), %%ebx \n \t "
// Taps 0..3 of the first and of the second output sample.
" movq (%1, %%ebp, 8), %%mm1 \n \t "
" movq 16(%1, %%ebp, 8), %%mm3 \n \t "
" movd (%3, %%eax), %%mm0 \n \t "
" movd (%3, %%ebx), %%mm2 \n \t "
" punpcklbw %%mm7, %%mm0 \n \t "
" punpcklbw %%mm7, %%mm2 \n \t "
" pmaddwd %%mm1, %%mm0 \n \t "
" pmaddwd %%mm2, %%mm3 \n \t "
// Taps 4..7 of the first and of the second output sample.
" movq 8(%1, %%ebp, 8), %%mm1 \n \t "
" movq 24(%1, %%ebp, 8), %%mm5 \n \t "
" movd 4(%3, %%eax), %%mm4 \n \t "
" movd 4(%3, %%ebx), %%mm2 \n \t "
" punpcklbw %%mm7, %%mm4 \n \t "
" punpcklbw %%mm7, %%mm2 \n \t "
" pmaddwd %%mm1, %%mm4 \n \t "
" pmaddwd %%mm2, %%mm5 \n \t "
// Combine both halves of each sample's sum.
" paddd %%mm4, %%mm0 \n \t "
" paddd %%mm5, %%mm3 \n \t "
" psrad $8, %%mm0 \n \t "
" psrad $8, %%mm3 \n \t "
" packssdw %%mm3, %%mm0 \n \t "
" pmaddwd %%mm6, %%mm0 \n \t "
" packssdw %%mm0, %%mm0 \n \t "
" movd %%mm0, (%4, %%ebp) \n \t "
" addl $4, %%ebp \n \t "
" jnc 1b \n \t "
" popl %%ebp \n \t "
: " +a " ( counter )
: " c " ( filter ) , " d " ( filterPos ) , " S " ( src ) , " D " ( dst )
: " %ebx "
) ;
}
else
{
// Generic variant for any other filterSize. Here filter is walked forwards
// by the asm itself, so only filterPos and dst are advanced beforehand.
int counter = - 2 * dstW ;
// filter-= counter*filterSize/2;
filterPos - = counter / 2 ;
dst - = counter / 2 ;
// Operands: %0 = counter, %1 = filter, %2 = filterPos (memory),
// %3 = dst (memory), %4 = src + filterSize (memory), %5 = src (memory),
// %6 = filterSize * 2, i.e. the size in bytes of one coefficient row.
// NOTE(review): the asm increments %1, which is declared as an input-only
// operand; GCC's extended-asm rules do not allow modifying inputs - confirm.
asm volatile (
" pxor %%mm7, %%mm7 \n \t "
" movq w02, %%mm6 \n \t "
" .balign 16 \n \t "
" 1: \n \t "
// Fetch the source positions of two consecutive output samples.
" movl %2, %%ecx \n \t "
" movzwl (%%ecx, %0), %%eax \n \t "
" movzwl 2(%%ecx, %0), %%ebx \n \t "
// ecx walks over the source; mm4/mm5 accumulate the two samples' sums.
" movl %5, %%ecx \n \t "
" pxor %%mm4, %%mm4 \n \t "
" pxor %%mm5, %%mm5 \n \t "
" 2: \n \t "
// Four taps of the first sample and of the second sample (one row further).
" movq (%1), %%mm1 \n \t "
" movq (%1, %6), %%mm3 \n \t "
" movd (%%ecx, %%eax), %%mm0 \n \t "
" movd (%%ecx, %%ebx), %%mm2 \n \t "
" punpcklbw %%mm7, %%mm0 \n \t "
" punpcklbw %%mm7, %%mm2 \n \t "
" pmaddwd %%mm1, %%mm0 \n \t "
" pmaddwd %%mm2, %%mm3 \n \t "
" paddd %%mm3, %%mm5 \n \t "
" paddd %%mm0, %%mm4 \n \t "
// Advance by 4 coefficients (8 bytes) and 4 source bytes; repeat until
// ecx reaches src + filterSize.
" addl $8, %1 \n \t "
" addl $4, %%ecx \n \t "
" cmpl %4, %%ecx \n \t "
" jb 2b \n \t "
// Skip the second sample's coefficient row, which was consumed via (%1, %6).
" addl %6, %1 \n \t "
" psrad $8, %%mm4 \n \t "
" psrad $8, %%mm5 \n \t "
" packssdw %%mm5, %%mm4 \n \t "
" pmaddwd %%mm6, %%mm4 \n \t "
" packssdw %%mm4, %%mm4 \n \t "
// Store two 16-bit results at dst + counter.
" movl %3, %%eax \n \t "
" movd %%mm4, (%%eax, %0) \n \t "
" addl $4, %0 \n \t "
" jnc 1b \n \t "
: " +r " ( counter )
: " r " ( filter ) , " m " ( filterPos ) , " m " ( dst ) , " m " ( src + filterSize ) ,
" m " ( src ) , " r " ( filterSize * 2 )
: " %ebx " , " %eax " , " %ecx " , " %edx "
) ;
}
# else
// Portable reference implementation.
int i ;
for ( i = 0 ; i < dstW ; i + + )
{
int j ;
int srcPos = filterPos [ i ] ;
int val = 0 ;
// printf("filterPos: %d\n", hFilterPos[i]);
for ( j = 0 ; j < filterSize ; j + + )
{
// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
val + = ( ( int ) src [ srcPos + j ] ) * filter [ filterSize * i + j ] ;
}
// filter += hFilterSize;
// Scale down by 7 bits and clamp to [0, 32767].
dst [ i ] = MIN ( MAX ( 0 , val > > 7 ) , ( 1 < < 15 ) - 1 ) ; // the cubic equation does overflow ...
// dst[i] = val>>7;
}
# endif
}
// *** horizontal scale Y line to temp buffer
static inline void RENAME ( hyscale ) ( uint16_t * dst , int dstWidth , uint8_t * src , int srcW , int xInc )
{
if ( sws_flags ! = SWS_FAST_BILINEAR )
{
RENAME ( hScale ) ( dst , dstWidth , src , srcW , xInc , hLumFilter , hLumFilterPos , hLumFilterSize ) ;
}
else // Fast Bilinear upscale / crap downscale
{
# ifdef ARCH_X86
# ifdef HAVE_MMX2
int i ;
@ -1267,11 +1438,19 @@ FUNNY_Y_CODE
xpos + = xInc ;
}
# endif
}
}
inline static void RENAME ( hcscale ) ( uint16_t * dst , int dstWidth ,
uint8_t * src1 , uint8_t * src2 , int srcW , int xInc )
{
if ( sws_flags ! = SWS_FAST_BILINEAR )
{
RENAME ( hScale ) ( dst , dstWidth , src1 , srcW , xInc , hChrFilter , hChrFilterPos , hChrFilterSize ) ;
RENAME ( hScale ) ( dst + 2048 , dstWidth , src2 , srcW , xInc , hChrFilter , hChrFilterPos , hChrFilterSize ) ;
}
else // Fast Bilinear upscale / crap downscale
{
# ifdef ARCH_X86
# ifdef HAVE_MMX2
int i ;
@ -1402,6 +1581,162 @@ FUNNYUVCODE
xpos + = xInc ;
}
# endif
}
}
/*
 * initFilter: precalculate the horizontal filter used by hScale.
 *
 * filter     - out: (*filterSize) coefficients per output sample, row after
 *              row, in fixed point where 1.0 == 1 << 14
 * filterPos  - out: index of the first source sample for each output sample
 * filterSize - out: number of taps per output sample
 * xInc       - source step per output sample, treated as 16.16 fixed point;
 *              values <= 1 << 16 select the upscale path
 * srcW, dstW - source / destination width in samples
 *
 * The kernel is bicubic when sws_flags == SWS_BICUBIC and linear otherwise.
 * After the coefficients are generated, taps that would read outside
 * [0, srcW) are folded onto the nearest edge sample, and every row is
 * rescaled so that its coefficients sum to (approximately, because of
 * truncation) 1 << 14.
 */
static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc,
                                      int srcW, int dstW)
{
    const int bicubic = (sws_flags == SWS_BICUBIC);
    int fs;        // taps per output sample, mirrored into *filterSize
    int xDstInSrc; // centre of the current output sample in source space (16.16)
    int i;
#ifdef HAVE_MMX
    asm volatile("emms\n\t" : : : "memory"); //FIXME this shouldnt be required but it IS
#endif

    if (xInc <= (1 << 16)) // upscale / cubic interpolate
    {
        fs = bicubic ? 4 : 2;
#ifdef HAVE_MMX
        fs = (fs + 3) & (~3); // the MMX hScale paths consume 4 taps at a time
#endif
        *filterSize = fs;

        xDstInSrc = xInc - 0x8000;
        for (i = 0; i < dstW; i++)
        {
            int16_t *row = filter + i * fs;
            int xx = (xDstInSrc >> 16) - (fs >> 1) + 1;
            int j;

            filterPos[i] = xx;
            if (bicubic)
            {
                // xx is negative at the left border, so multiply instead of
                // shifting: left-shifting a negative int is undefined.
                double d = ABS(((xx + 1) * (1 << 16)) - xDstInSrc) / (double)(1 << 16);
                double A = -0.75;

                // Equation is from VirtualDub
                row[0] = (int)floor(0.5 + (+A * d - 2.0 * A * d * d + A * d * d * d) * 16384.0);
                row[1] = (int)floor(0.5 + (+1.0 - (A + 3.0) * d * d + (A + 2.0) * d * d * d) * 16384.0);
                row[2] = (int)floor(0.5 + (-A * d + (2.0 * A + 3.0) * d * d - (A + 2.0) * d * d * d) * 16384.0);
                row[3] = (int)floor(0.5 + (+A * d * d - A * d * d * d) * 16384.0);
            }
            else
            {
                for (j = 0; j < fs; j++)
                {
                    double d = ABS((xx * (1 << 16)) - xDstInSrc) / (double)(1 << 16);
                    int coeff = (int)(0.5 + (1.0 - d) * (1 << 14));

                    if (coeff < 0) coeff = 0;
                    row[j] = coeff;
                    xx++;
                }
            }
            xDstInSrc += xInc;
        }
    }
    else // downscale
    {
        if (bicubic) fs = (int)ceil(1 + 4.0 * srcW / (double)dstW);
        else         fs = (int)ceil(1 + 2.0 * srcW / (double)dstW);
#ifdef HAVE_MMX
        fs = (fs + 3) & (~3); // the MMX hScale paths consume 4 taps at a time
#endif
        *filterSize = fs;

        xDstInSrc = xInc - 0x8000;
        for (i = 0; i < dstW; i++)
        {
            int16_t *row = filter + i * fs;
            int xx = (int)((double)xDstInSrc / (double)(1 << 16) - fs * 0.5 + 0.5);
            int j;

            filterPos[i] = xx;
            for (j = 0; j < fs; j++)
            {
                // Distance to the tap, measured in output-sample units.
                double d = ABS((xx * (1 << 16)) - xDstInSrc) / (double)xInc;
                int coeff;

                if (bicubic)
                {
                    double A = -0.75;

                    // Equation is from VirtualDub
                    if (d < 1.0)
                        coeff = (int)floor(0.5 + (1.0 - (A + 3.0) * d * d
                                                  + (A + 2.0) * d * d * d) * (1 << 14));
                    else if (d < 2.0)
                        coeff = (int)floor(0.5 + (-4.0 * A + 8.0 * A * d
                                                  - 5.0 * A * d * d + A * d * d * d) * (1 << 14));
                    else
                        coeff = 0;
                }
                else
                {
                    coeff = (int)(0.5 + (1.0 - d) * (1 << 14));
                    if (coeff < 0) coeff = 0;
                }
                row[j] = coeff;
                xx++;
            }
            xDstInSrc += xInc;
        }
    }

    // Fix borders: fold taps that fall outside the source line onto its edges.
    for (i = 0; i < dstW; i++)
    {
        int16_t *row = filter + i * fs;
        int j;

        if (filterPos[i] < 0)
        {
            // Move filter coeffs left to compensate for filterPos
            for (j = 1; j < fs; j++)
            {
                int left = MAX(j + filterPos[i], 0);

                row[left] += row[j];
                row[j] = 0;
            }
            filterPos[i] = 0;
        }

        if (filterPos[i] + fs > srcW)
        {
            int shift = filterPos[i] + fs - srcW;

            // Move filter coeffs right to compensate for filterPos
            for (j = fs - 2; j >= 0; j--)
            {
                int right = MIN(j + shift, fs - 1);

                row[right] += row[j];
                row[j] = 0;
            }
            filterPos[i] = srcW - fs;
        }
    }

    // Normalize: rescale every row so its coefficients sum to 1 << 14.
    for (i = 0; i < dstW; i++)
    {
        int16_t *row = filter + i * fs;
        double sum = 0;
        double scale;
        int j;

        for (j = 0; j < fs; j++)
            sum += row[j];

        // A zero sum cannot be normalized; dividing would yield inf/NaN and an
        // undefined double -> int conversion, so leave such a row untouched.
        if (sum == 0)
            continue;

        scale = (double)(1 << 14) / sum;
        for (j = 0; j < fs; j++)
            row[j] = (int)(row[j] * scale);
    }
}
static void RENAME ( SwScale_YV12slice ) ( unsigned char * srcptr [ ] , int stride [ ] , int srcSliceY ,
@ -1421,11 +1756,8 @@ static int dstY;
static int lastLumSrcY ;
static int lastChrSrcY ;
# ifdef HAVE_MMX2
// used to detect a horizontal size change
static int old_dstW = - 1 ;
static int old_s_xinc = - 1 ;
# endif
static int oldDstW = - 1 ;
static int oldSrcW = - 1 ;
int dstUVw ;
int i ;
@ -1469,10 +1801,19 @@ else s_xinc2= s_xinc;
= pix_buf_uv [ 0 ] [ 2048 + i / 2 ] = pix_buf_uv [ 1 ] [ 2048 + i / 2 ] = 128 * 128 ;
pix_buf_y [ 0 ] [ i ] = pix_buf_y [ 1 ] [ i ] = 0 ;
}
//precalculate horizontal scaler filter coefficients
if ( oldDstW ! = dstW | | oldSrcW ! = srcW )
{
// int i;
oldDstW = dstW ; oldSrcW = srcW ;
RENAME ( initFilter ) ( hLumFilter , hLumFilterPos , & hLumFilterSize , s_xinc , srcW , dstW ) ;
RENAME ( initFilter ) ( hChrFilter , hChrFilterPos , & hChrFilterSize , s_xinc2 , srcW , dstW ) ;
# ifdef HAVE_MMX2
// cant downscale !!!
if ( ( old_s_xinc ! = s_xinc | | old_dstW ! = dstW ) & & canMMX2BeUsed )
if ( canMMX2BeUsed )
{
uint8_t * fragment ;
int imm8OfPShufW1 ;
@ -1481,9 +1822,6 @@ else s_xinc2= s_xinc;
int xpos , i ;
old_s_xinc = s_xinc ;
old_dstW = dstW ;
// create an optimized horizontal scaling routine
//code fragment
@ -1532,20 +1870,6 @@ else s_xinc2= s_xinc;
xpos = 0 ; //s_xinc/2 - 0x8000; // difference between pixel centers
/* choose xinc so that all 8 parts fit exactly
Note : we cannot use just 1 part because it would not fit in the code cache */
// s_xinc2_diff= -((((s_xinc2*(dstW/8))&0xFFFF))/(dstW/8))-10;
// s_xinc_diff= -((((s_xinc*(dstW/8))&0xFFFF))/(dstW/8));
# ifdef ALT_ERROR
// s_xinc2_diff+= ((0x10000/(dstW/8)));
# endif
// s_xinc_diff= s_xinc2_diff*2;
// s_xinc2+= s_xinc2_diff;
// s_xinc+= s_xinc_diff;
// old_s_xinc= s_xinc;
for ( i = 0 ; i < dstW / 8 ; i + + )
{
int xx = xpos > > 16 ;
@ -1602,6 +1926,7 @@ else s_xinc2= s_xinc;
}
# endif // HAVE_MMX2
} // Init stuff
} // reset counters
while ( 1 ) {