@ -68,12 +68,12 @@ void ff_simple_idct_neon(DCTELEM *data);
void ff_simple_idct_axp ( DCTELEM * data ) ;
struct algo {
const char * name ;
enum { FDCT , IDCT } is_idct ;
void ( * func ) ( DCTELEM * block ) ;
void ( * ref ) ( DCTELEM * block ) ;
enum formattag { NO_PERM , MMX_PERM , MMX_SIMPLE_PERM , SCALE_PERM , SSE2_PERM , PARTTRANS_PERM } forma t;
int mm_support ;
const char * name ;
void ( * func ) ( DCTELEM * block ) ;
enum formattag { NO_PERM , MMX_PERM , MMX_SIMPLE_PERM , SCALE_PERM ,
SSE2_PERM , PARTTRANS_PERM } format ;
int mm_suppor t ;
int nonspec ;
} ;
# ifndef FAAN_POSTSCALE
@ -84,61 +84,69 @@ struct algo {
static int cpu_flags ;
struct algo algos [ ] = {
{ " REF-DBL " , 0 , ff_ref_fdct , ff_ref_fdct , NO_PERM } ,
{ " FAAN " , 0 , ff_faandct , ff_ref_fdct , FAAN_SCALE } ,
{ " FAANI " , 1 , ff_faanidct , ff_ref_idct , NO_PERM } ,
{ " IJG-AAN-INT " , 0 , fdct_ifast , ff_ref_fdct , SCALE_PERM } ,
{ " IJG-LLM-INT " , 0 , ff_jpeg_fdct_islow , ff_ref_fdct , NO_PERM } ,
{ " REF-DBL " , 1 , ff_ref_idct , ff_ref_idct , NO_PERM } ,
{ " INT " , 1 , j_rev_dct , ff_ref_idct , MMX_PERM } ,
{ " SIMPLE-C " , 1 , ff_simple_idct , ff_ref_idct , NO_PERM } ,
static const struct algo fdct_tab [ ] = {
{ " REF-DBL " , ff_ref_fdct , NO_PERM } ,
{ " FAAN " , ff_faandct , FAAN_SCALE } ,
{ " IJG-AAN-INT " , fdct_ifast , SCALE_PERM } ,
{ " IJG-LLM-INT " , ff_jpeg_fdct_islow , NO_PERM } ,
# if HAVE_MMX
{ " MMX " , 0 , ff_fdct_mmx , ff_ref_fdct , NO_PERM , AV_CPU_FLAG_MMX } ,
# if HAVE_MMX2
{ " MMX2 " , 0 , ff_fdct_mmx2 , ff_ref_fdct , NO_PERM , AV_CPU_FLAG_MMX2 } ,
{ " SSE2 " , 0 , ff_fdct_sse2 , ff_ref_fdct , NO_PERM , AV_CPU_FLAG_SSE2 } ,
{ " MMX " , ff_fdct_mmx , NO_PERM , AV_CPU_FLAG_MMX } ,
{ " MMX2 " , ff_fdct_mmx2 , NO_PERM , AV_CPU_FLAG_MMX2 } ,
{ " SSE2 " , ff_fdct_sse2 , NO_PERM , AV_CPU_FLAG_SSE2 } ,
# endif
# if CONFIG_GPL
{ " LIBMPEG2-MMX " , 1 , ff_mmx_idct , ff_ref_idct , MMX_PERM , AV_CPU_FLAG_MMX } ,
{ " LIBMPEG2-MMX2 " , 1 , ff_mmxext_idct , ff_ref_idct , MMX_PERM , AV_CPU_FLAG_MMX2 } ,
# if HAVE_ALTIVEC
{ " altivecfdct " , fdct_altivec , NO_PERM , AV_CPU_FLAG_ALTIVEC } ,
# endif
{ " SIMPLE-MMX " , 1 , ff_simple_idct_mmx , ff_ref_idct , MMX_SIMPLE_PERM , AV_CPU_FLAG_MMX } ,
{ " XVID-MMX " , 1 , ff_idct_xvid_mmx , ff_ref_idct , NO_PERM , AV_CPU_FLAG_MMX } ,
{ " XVID-MMX2 " , 1 , ff_idct_xvid_mmx2 , ff_ref_idct , NO_PERM , AV_CPU_FLAG_MMX2 } ,
{ " XVID-SSE2 " , 1 , ff_idct_xvid_sse2 , ff_ref_idct , SSE2_PERM , AV_CPU_FLAG_SSE2 } ,
# if ARCH_BFIN
{ " BFINfdct " , ff_bfin_fdct , NO_PERM } ,
# endif
# if HAVE_ALTIVEC
{ " altivecfdct " , 0 , fdct_altivec , ff_ref_fdct , NO_PERM , AV_CPU_FLAG_ALTIVEC } ,
{ 0 }
} ;
static const struct algo idct_tab [ ] = {
{ " FAANI " , ff_faanidct , NO_PERM } ,
{ " REF-DBL " , ff_ref_idct , NO_PERM } ,
{ " INT " , j_rev_dct , MMX_PERM } ,
{ " SIMPLE-C " , ff_simple_idct , NO_PERM } ,
# if HAVE_MMX
# if CONFIG_GPL
{ " LIBMPEG2-MMX " , ff_mmx_idct , MMX_PERM , AV_CPU_FLAG_MMX , 1 } ,
{ " LIBMPEG2-MMX2 " , ff_mmxext_idct , MMX_PERM , AV_CPU_FLAG_MMX2 , 1 } ,
# endif
{ " SIMPLE-MMX " , ff_simple_idct_mmx , MMX_SIMPLE_PERM , AV_CPU_FLAG_MMX } ,
{ " XVID-MMX " , ff_idct_xvid_mmx , NO_PERM , AV_CPU_FLAG_MMX , 1 } ,
{ " XVID-MMX2 " , ff_idct_xvid_mmx2 , NO_PERM , AV_CPU_FLAG_MMX2 , 1 } ,
{ " XVID-SSE2 " , ff_idct_xvid_sse2 , SSE2_PERM , AV_CPU_FLAG_SSE2 , 1 } ,
# endif
# if ARCH_BFIN
{ " BFINfdct " , 0 , ff_bfin_fdct , ff_ref_fdct , NO_PERM } ,
{ " BFINidct " , 1 , ff_bfin_idct , ff_ref_idct , NO_PERM } ,
{ " BFINidct " , ff_bfin_idct , NO_PERM } ,
# endif
# if ARCH_ARM
{ " SIMPLE-ARM " , 1 , ff_simple_idct_arm , ff_ref_idct , NO_PERM } ,
{ " INT-ARM " , 1 , ff_j_rev_dct_arm , ff_ref_idct , MMX_PERM } ,
{ " SIMPLE-ARM " , ff_simple_idct_arm , NO_PERM } ,
{ " INT-ARM " , ff_j_rev_dct_arm , MMX_PERM } ,
# endif
# if HAVE_ARMV5TE
{ " SIMPLE-ARMV5TE " , 1 , ff_simple_idct_armv5te , ff_ref_idct , NO_PERM } ,
{ " SIMPLE-ARMV5TE " , ff_simple_idct_armv5te , NO_PERM } ,
# endif
# if HAVE_ARMV6
{ " SIMPLE-ARMV6 " , 1 , ff_simple_idct_armv6 , ff_ref_idct , MMX_PERM } ,
{ " SIMPLE-ARMV6 " , ff_simple_idct_armv6 , MMX_PERM } ,
# endif
# if HAVE_NEON
{ " SIMPLE-NEON " , 1 , ff_simple_idct_neon , ff_ref_idct , PARTTRANS_PERM } ,
{ " SIMPLE-NEON " , ff_simple_idct_neon , PARTTRANS_PERM } ,
# endif
# endif /* ARCH_ARM */
# if ARCH_ALPHA
{ " SIMPLE-ALPHA " , 1 , ff_simple_idct_axp , ff_ref_idct , NO_PERM } ,
{ " SIMPLE-ALPHA " , ff_simple_idct_axp , NO_PERM } ,
# endif
{ 0 }
{ 0 }
} ;
# define AANSCALE_BITS 12
@ -148,7 +156,7 @@ uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
static int64_t gettime ( void )
{
struct timeval tv ;
gettimeofday ( & tv , NULL ) ;
gettimeofday ( & tv , NULL ) ;
return ( int64_t ) tv . tv_sec * 1000000 + tv . tv_usec ;
}
@ -157,18 +165,18 @@ static int64_t gettime(void)
static short idct_mmx_perm [ 64 ] ;
static short idct_simple_mmx_perm [ 64 ] = {
0x00 , 0x08 , 0x04 , 0x09 , 0x01 , 0x0C , 0x05 , 0x0D ,
0x10 , 0x18 , 0x14 , 0x19 , 0x11 , 0x1C , 0x15 , 0x1D ,
0x20 , 0x28 , 0x24 , 0x29 , 0x21 , 0x2C , 0x25 , 0x2D ,
0x12 , 0x1A , 0x16 , 0x1B , 0x13 , 0x1E , 0x17 , 0x1F ,
0x02 , 0x0A , 0x06 , 0x0B , 0x03 , 0x0E , 0x07 , 0x0F ,
0x30 , 0x38 , 0x34 , 0x39 , 0x31 , 0x3C , 0x35 , 0x3D ,
0x22 , 0x2A , 0x26 , 0x2B , 0x23 , 0x2E , 0x27 , 0x2F ,
0x32 , 0x3A , 0x36 , 0x3B , 0x33 , 0x3E , 0x37 , 0x3F ,
static short idct_simple_mmx_perm [ 64 ] = {
0x00 , 0x08 , 0x04 , 0x09 , 0x01 , 0x0C , 0x05 , 0x0D ,
0x10 , 0x18 , 0x14 , 0x19 , 0x11 , 0x1C , 0x15 , 0x1D ,
0x20 , 0x28 , 0x24 , 0x29 , 0x21 , 0x2C , 0x25 , 0x2D ,
0x12 , 0x1A , 0x16 , 0x1B , 0x13 , 0x1E , 0x17 , 0x1F ,
0x02 , 0x0A , 0x06 , 0x0B , 0x03 , 0x0E , 0x07 , 0x0F ,
0x30 , 0x38 , 0x34 , 0x39 , 0x31 , 0x3C , 0x35 , 0x3D ,
0x22 , 0x2A , 0x26 , 0x2B , 0x23 , 0x2E , 0x27 , 0x2F ,
0x32 , 0x3A , 0x36 , 0x3B , 0x33 , 0x3E , 0x37 , 0x3F ,
} ;
static const uint8_t idct_sse2_row_perm [ 8 ] = { 0 , 4 , 1 , 5 , 2 , 6 , 3 , 7 } ;
static const uint8_t idct_sse2_row_perm [ 8 ] = { 0 , 4 , 1 , 5 , 2 , 6 , 3 , 7 } ;
static void idct_mmx_init ( void )
{
@ -177,13 +185,12 @@ static void idct_mmx_init(void)
/* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
for ( i = 0 ; i < 64 ; i + + ) {
idct_mmx_perm [ i ] = ( i & 0x38 ) | ( ( i & 6 ) > > 1 ) | ( ( i & 1 ) < < 2 ) ;
// idct_simple_mmx_perm[i] = simple_block_permute_op(i);
}
}
DECLARE_ALIGNED ( 16 , static DCTELEM , block ) [ 64 ] ;
DECLARE_ALIGNED ( 8 , static DCTELEM , block1 ) [ 64 ] ;
DECLARE_ALIGNED ( 8 , static DCTELEM , block_org ) [ 64 ] ;
DECLARE_ALIGNED ( 8 , static DCTELEM , block1 ) [ 64 ] ;
DECLARE_ALIGNED ( 8 , static DCTELEM , block_org ) [ 64 ] ;
static inline void mmx_emms ( void )
{
@ -193,188 +200,177 @@ static inline void mmx_emms(void)
# endif
}
static void dct_error ( const char * name , int is_idct ,
void ( * fdct_func ) ( DCTELEM * block ) ,
void ( * fdct_ref ) ( DCTELEM * block ) , int form , int test , const int bits )
static int dct_error ( const struct algo * dct , int test , int is_idct , int speed , const int bits )
{
void ( * ref ) ( DCTELEM * block ) = is_idct ? ff_ref_idct : ff_ref_fdct ;
int it , i , scale ;
int err_inf , v ;
int64_t err2 , ti , ti1 , it1 ;
int64_t sysErr [ 64 ] , sysErrMax = 0 ;
int maxout = 0 ;
int blockSumErrMax = 0 , blockSumErr ;
int64_t err2 , ti , ti1 , it1 , err_sum = 0 ;
int64_t sysErr [ 64 ] , sysErrMax = 0 ;
int maxout = 0 ;
int blockSumErrMax = 0 , blockSumErr ;
AVLFG prng ;
const int vals = 1 < < bits ;
double omse , ome ;
int spec_err ;
av_lfg_init ( & prng , 1 ) ;
err_inf = 0 ;
err2 = 0 ;
for ( i = 0 ; i < 64 ; i + + ) sysErr [ i ] = 0 ;
for ( it = 0 ; it < NB_ITS ; it + + ) {
for ( i = 0 ; i < 64 ; i + + )
for ( i = 0 ; i < 64 ; i + + )
sysErr [ i ] = 0 ;
for ( it = 0 ; it < NB_ITS ; it + + ) {
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] = 0 ;
switch ( test ) {
switch ( test ) {
case 0 :
for ( i = 0 ; i < 64 ; i + + )
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] = ( av_lfg_get ( & prng ) % ( 2 * vals ) ) - vals ;
if ( is_idct ) {
if ( is_idct ) {
ff_ref_fdct ( block1 ) ;
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] > > = 3 ;
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] > > = 3 ;
}
break ;
case 1 : {
int num = av_lfg_get ( & prng ) % 10 + 1 ;
for ( i = 0 ; i < num ; i + + )
block1 [ av_lfg_get ( & prng ) % 64 ] = av_lfg_get ( & prng ) % ( 2 * vals ) - vals ;
} break ;
break ;
case 1 : {
int num = av_lfg_get ( & prng ) % 10 + 1 ;
for ( i = 0 ; i < num ; i + + )
block1 [ av_lfg_get ( & prng ) % 64 ] = av_lfg_get ( & prng ) % ( 2 * vals ) - vals ;
}
break ;
case 2 :
block1 [ 0 ] = av_lfg_get ( & prng ) % ( 16 * vals ) - ( 8 * vals ) ;
block1 [ 63 ] = ( block1 [ 0 ] & 1 ) ^ 1 ;
break ;
block1 [ 63 ] = ( block1 [ 0 ] & 1 ) ^ 1 ;
break ;
}
#if 0 // simulate mismatch control
{ int sum = 0 ;
for ( i = 0 ; i < 64 ; i + + )
sum + = block1 [ i ] ;
if ( ( sum & 1 ) = = 0 ) block1 [ 63 ] ^ = 1 ;
}
# endif
for ( i = 0 ; i < 64 ; i + + )
block_org [ i ] = block1 [ i ] ;
for ( i = 0 ; i < 64 ; i + + )
block_org [ i ] = block1 [ i ] ;
if ( form = = MMX_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
if ( dct - > format = = MMX_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ idct_mmx_perm [ i ] ] = block1 [ i ] ;
} else if ( form = = MMX_SIMPLE_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
} else if ( dct - > format = = MMX_SIMPLE_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ idct_simple_mmx_perm [ i ] ] = block1 [ i ] ;
} else if ( form = = SSE2_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ ( i & 0x38 ) | idct_sse2_row_perm [ i & 7 ] ] = block1 [ i ] ;
} else if ( form = = PARTTRANS_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ ( i & 0x24 ) | ( ( i & 3 ) < < 3 ) | ( ( i > > 3 ) & 3 ) ] = block1 [ i ] ;
} else if ( dct - > format = = SSE2_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ ( i & 0x38 ) | idct_sse2_row_perm [ i & 7 ] ] = block1 [ i ] ;
} else if ( dct - > format = = PARTTRANS_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ ( i & 0x24 ) | ( ( i & 3 ) < < 3 ) | ( ( i > > 3 ) & 3 ) ] = block1 [ i ] ;
} else {
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
}
#if 0 // simulate mismatch control for tested IDCT but not the ref
{ int sum = 0 ;
for ( i = 0 ; i < 64 ; i + + )
sum + = block [ i ] ;
if ( ( sum & 1 ) = = 0 ) block [ 63 ] ^ = 1 ;
}
# endif
fdct_ func( block ) ;
dct - > func ( block ) ;
mmx_emms ( ) ;
if ( form = = SCALE_PERM ) {
for ( i = 0 ; i < 64 ; i + + ) {
scale = 8 * ( 1 < < ( AANSCALE_BITS + 11 ) ) / ff_aanscales [ i ] ;
block [ i ] = ( block [ i ] * scale /*+ (1<<(AANSCALE_BITS-1))*/ ) > > AANSCALE_BITS ;
if ( dct - > format = = SCALE_PERM ) {
for ( i = 0 ; i < 64 ; i + + ) {
scale = 8 * ( 1 < < ( AANSCALE_BITS + 11 ) ) / ff_aanscales [ i ] ;
block [ i ] = ( block [ i ] * scale ) > > AANSCALE_BITS ;
}
}
fdct_ ref( block1 ) ;
ref ( block1 ) ;
blockSumErr = 0 ;
for ( i = 0 ; i < 64 ; i + + ) {
v = abs ( block [ i ] - block1 [ i ] ) ;
blockSumErr = 0 ;
for ( i = 0 ; i < 64 ; i + + ) {
int err = block [ i ] - block1 [ i ] ;
err_sum + = err ;
v = abs ( err ) ;
if ( v > err_inf )
err_inf = v ;
err2 + = v * v ;
sysErr [ i ] + = block [ i ] - block1 [ i ] ;
blockSumErr + = v ;
if ( abs ( block [ i ] ) > maxout ) maxout = abs ( block [ i ] ) ;
if ( abs ( block [ i ] ) > maxout )
maxout = abs ( block [ i ] ) ;
}
if ( blockSumErrMax < blockSumErr ) blockSumErrMax = blockSumErr ;
#if 0 // print different matrix pairs
if ( blockSumErr ) {
printf ( " \n " ) ;
for ( i = 0 ; i < 64 ; i + + ) {
if ( ( i & 7 ) = = 0 ) printf ( " \n " ) ;
printf ( " %4d " , block_org [ i ] ) ;
}
for ( i = 0 ; i < 64 ; i + + ) {
if ( ( i & 7 ) = = 0 ) printf ( " \n " ) ;
printf ( " %4d " , block [ i ] - block1 [ i ] ) ;
}
}
# endif
if ( blockSumErrMax < blockSumErr )
blockSumErrMax = blockSumErr ;
}
for ( i = 0 ; i < 64 ; i + + ) sysErrMax = FFMAX ( sysErrMax , FFABS ( sysErr [ i ] ) ) ;
for ( i = 0 ; i < 64 ; i + + )
sysErrMax = FFMAX ( sysErrMax , FFABS ( sysErr [ i ] ) ) ;
for ( i = 0 ; i < 64 ; i + + ) {
if ( i % 8 = = 0 ) printf ( " \n " ) ;
printf ( " %7d " , ( int ) sysErr [ i ] ) ;
for ( i = 0 ; i < 64 ; i + + ) {
if ( i % 8 = = 0 )
printf ( " \n " ) ;
printf ( " %7d " , ( int ) sysErr [ i ] ) ;
}
printf ( " \n " ) ;
printf ( " %s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d \n " ,
is_idct ? " IDCT " : " DCT " ,
name , err_inf , ( double ) err2 / NB_ITS / 64.0 , ( double ) sysErrMax / NB_ITS , maxout , blockSumErrMax ) ;
omse = ( double ) err2 / NB_ITS / 64 ;
ome = ( double ) err_sum / NB_ITS / 64 ;
spec_err = is_idct & & ( err_inf > 1 | | omse > 0.02 | | fabs ( ome ) > 0.0015 ) ;
printf ( " %s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d \n " ,
is_idct ? " IDCT " : " DCT " , dct - > name , err_inf ,
omse , ome , ( double ) sysErrMax / NB_ITS ,
maxout , blockSumErrMax ) ;
if ( spec_err & & ! dct - > nonspec )
return 1 ;
if ( ! speed )
return 0 ;
/* speed test */
for ( i = 0 ; i < 64 ; i + + )
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] = 0 ;
switch ( test ) {
switch ( test ) {
case 0 :
for ( i = 0 ; i < 64 ; i + + )
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] = av_lfg_get ( & prng ) % ( 2 * vals ) - vals ;
if ( is_idct ) {
if ( is_idct ) {
ff_ref_fdct ( block1 ) ;
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] > > = 3 ;
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] > > = 3 ;
}
break ;
case 1 : {
break ;
case 1 :
case 2 :
block1 [ 0 ] = av_lfg_get ( & prng ) % ( 2 * vals ) - vals ;
block1 [ 1 ] = av_lfg_get ( & prng ) % ( 2 * vals ) - vals ;
block1 [ 2 ] = av_lfg_get ( & prng ) % ( 2 * vals ) - vals ;
block1 [ 3 ] = av_lfg_get ( & prng ) % ( 2 * vals ) - vals ;
} break ;
break ;
}
if ( form = = MMX_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
if ( dct - > format = = MMX_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ idct_mmx_perm [ i ] ] = block1 [ i ] ;
} else if ( form = = MMX_SIMPLE_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
} else if ( dct - > format = = MMX_SIMPLE_PERM ) {
for ( i = 0 ; i < 64 ; i + + )
block [ idct_simple_mmx_perm [ i ] ] = block1 [ i ] ;
} else {
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
}
ti = gettime ( ) ;
it1 = 0 ;
do {
for ( it = 0 ; it < NB_ITS_SPEED ; it + + ) {
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
// memcpy(block, block1, sizeof(DCTELEM) * 64);
// do not memcpy especially not fastmemcpy because it does movntq !!!
fdct_func ( block ) ;
for ( it = 0 ; it < NB_ITS_SPEED ; it + + ) {
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
dct - > func ( block ) ;
}
it1 + = NB_ITS_SPEED ;
ti1 = gettime ( ) - ti ;
} while ( ti1 < 1000000 ) ;
mmx_emms ( ) ;
printf ( " %s %s: %0.1f kdct/s \n " ,
is_idct ? " IDCT " : " DCT " ,
name , ( double ) it1 * 1000.0 / ( double ) ti1 ) ;
printf ( " %s %s: %0.1f kdct/s \n " , is_idct ? " IDCT " : " DCT " , dct - > name ,
( double ) it1 * 1000.0 / ( double ) ti1 ) ;
return 0 ;
}
DECLARE_ALIGNED ( 8 , static uint8_t , img_dest ) [ 64 ] ;
@ -392,19 +388,19 @@ static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
if ( ! init ) {
init = 1 ;
for ( i = 0 ; i < 8 ; i + + ) {
for ( i = 0 ; i < 8 ; i + + ) {
sum = 0 ;
for ( j = 0 ; j < 8 ; j + + ) {
s = ( i = = 0 ) ? sqrt ( 1.0 / 8.0 ) : sqrt ( 1.0 / 4.0 ) ;
for ( j = 0 ; j < 8 ; j + + ) {
s = ( i = = 0 ) ? sqrt ( 1.0 / 8.0 ) : sqrt ( 1.0 / 4.0 ) ;
c8 [ i ] [ j ] = s * cos ( M_PI * i * ( j + 0.5 ) / 8.0 ) ;
sum + = c8 [ i ] [ j ] * c8 [ i ] [ j ] ;
}
}
for ( i = 0 ; i < 4 ; i + + ) {
for ( i = 0 ; i < 4 ; i + + ) {
sum = 0 ;
for ( j = 0 ; j < 4 ; j + + ) {
s = ( i = = 0 ) ? sqrt ( 1.0 / 4.0 ) : sqrt ( 1.0 / 2.0 ) ;
for ( j = 0 ; j < 4 ; j + + ) {
s = ( i = = 0 ) ? sqrt ( 1.0 / 4.0 ) : sqrt ( 1.0 / 2.0 ) ;
c4 [ i ] [ j ] = s * cos ( M_PI * i * ( j + 0.5 ) / 4.0 ) ;
sum + = c4 [ i ] [ j ] * c4 [ i ] [ j ] ;
}
@ -413,58 +409,59 @@ static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* butterfly */
s = 0.5 * sqrt ( 2.0 ) ;
for ( i = 0 ; i < 4 ; i + + ) {
for ( j = 0 ; j < 8 ; j + + ) {
block1 [ 8 * ( 2 * i ) + j ] = ( block [ 8 * ( 2 * i ) + j ] + block [ 8 * ( 2 * i + 1 ) + j ] ) * s ;
block1 [ 8 * ( 2 * i + 1 ) + j ] = ( block [ 8 * ( 2 * i ) + j ] - block [ 8 * ( 2 * i + 1 ) + j ] ) * s ;
for ( i = 0 ; i < 4 ; i + + ) {
for ( j = 0 ; j < 8 ; j + + ) {
block1 [ 8 * ( 2 * i ) + j ] =
( block [ 8 * ( 2 * i ) + j ] + block [ 8 * ( 2 * i + 1 ) + j ] ) * s ;
block1 [ 8 * ( 2 * i + 1 ) + j ] =
( block [ 8 * ( 2 * i ) + j ] - block [ 8 * ( 2 * i + 1 ) + j ] ) * s ;
}
}
/* idct8 on lines */
for ( i = 0 ; i < 8 ; i + + ) {
for ( j = 0 ; j < 8 ; j + + ) {
for ( i = 0 ; i < 8 ; i + + ) {
for ( j = 0 ; j < 8 ; j + + ) {
sum = 0 ;
for ( k = 0 ; k < 8 ; k + + )
sum + = c8 [ k ] [ j ] * block1 [ 8 * i + k ] ;
block2 [ 8 * i + j ] = sum ;
for ( k = 0 ; k < 8 ; k + + )
sum + = c8 [ k ] [ j ] * block1 [ 8 * i + k ] ;
block2 [ 8 * i + j ] = sum ;
}
}
/* idct4 */
for ( i = 0 ; i < 8 ; i + + ) {
for ( j = 0 ; j < 4 ; j + + ) {
for ( i = 0 ; i < 8 ; i + + ) {
for ( j = 0 ; j < 4 ; j + + ) {
/* top */
sum = 0 ;
for ( k = 0 ; k < 4 ; k + + )
sum + = c4 [ k ] [ j ] * block2 [ 8 * ( 2 * k ) + i ] ;
block3 [ 8 * ( 2 * j ) + i ] = sum ;
for ( k = 0 ; k < 4 ; k + + )
sum + = c4 [ k ] [ j ] * block2 [ 8 * ( 2 * k ) + i ] ;
block3 [ 8 * ( 2 * j ) + i ] = sum ;
/* bottom */
sum = 0 ;
for ( k = 0 ; k < 4 ; k + + )
sum + = c4 [ k ] [ j ] * block2 [ 8 * ( 2 * k + 1 ) + i ] ;
block3 [ 8 * ( 2 * j + 1 ) + i ] = sum ;
for ( k = 0 ; k < 4 ; k + + )
sum + = c4 [ k ] [ j ] * block2 [ 8 * ( 2 * k + 1 ) + i ] ;
block3 [ 8 * ( 2 * j + 1 ) + i ] = sum ;
}
}
/* clamp and store the result */
for ( i = 0 ; i < 8 ; i + + ) {
for ( j = 0 ; j < 8 ; j + + ) {
v = block3 [ 8 * i + j ] ;
if ( v < 0 )
v = 0 ;
else if ( v > 255 )
v = 255 ;
dest [ i * linesize + j ] = ( int ) rint ( v ) ;
for ( i = 0 ; i < 8 ; i + + ) {
for ( j = 0 ; j < 8 ; j + + ) {
v = block3 [ 8 * i + j ] ;
if ( v < 0 ) v = 0 ;
else if ( v > 255 ) v = 255 ;
dest [ i * linesize + j ] = ( int ) rint ( v ) ;
}
}
}
static void idct248_error ( const char * name ,
void ( * idct248_put ) ( uint8_t * dest , int line_size , int16_t * block ) )
void ( * idct248_put ) ( uint8_t * dest , int line_size ,
int16_t * block ) ,
int speed )
{
int it , i , it1 , ti , ti1 , err_max , v ;
AVLFG prng ;
av_lfg_init ( & prng , 1 ) ;
@ -472,23 +469,22 @@ static void idct248_error(const char *name,
/* just one test to see if code is correct (precision is less
important here ) */
err_max = 0 ;
for ( it = 0 ; it < NB_ITS ; it + + ) {
for ( it = 0 ; it < NB_ITS ; it + + ) {
/* XXX: use forward transform to generate values */
for ( i = 0 ; i < 64 ; i + + )
for ( i = 0 ; i < 64 ; i + + )
block1 [ i ] = av_lfg_get ( & prng ) % 256 - 128 ;
block1 [ 0 ] + = 1024 ;
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
idct248_ref ( img_dest1 , 8 , block ) ;
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
idct248_put ( img_dest , 8 , block ) ;
for ( i = 0 ; i < 64 ; i + + ) {
v = abs ( ( int ) img_dest [ i ] - ( int ) img_dest1 [ i ] ) ;
for ( i = 0 ; i < 64 ; i + + ) {
v = abs ( ( int ) img_dest [ i ] - ( int ) img_dest1 [ i ] ) ;
if ( v = = 255 )
printf ( " %d %d \n " , img_dest [ i ] , img_dest1 [ i ] ) ;
if ( v > err_max )
@ -514,18 +510,17 @@ static void idct248_error(const char *name,
}
# endif
}
printf ( " %s %s: err_inf=%d \n " ,
1 ? " IDCT248 " : " DCT248 " ,
name , err_max ) ;
printf ( " %s %s: err_inf=%d \n " , 1 ? " IDCT248 " : " DCT248 " , name , err_max ) ;
if ( ! speed )
return ;
ti = gettime ( ) ;
it1 = 0 ;
do {
for ( it = 0 ; it < NB_ITS_SPEED ; it + + ) {
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
// memcpy(block, block1, sizeof(DCTELEM) * 64);
// do not memcpy especially not fastmemcpy because it does movntq !!!
for ( it = 0 ; it < NB_ITS_SPEED ; it + + ) {
for ( i = 0 ; i < 64 ; i + + )
block [ i ] = block1 [ i ] ;
idct248_put ( img_dest , 8 , block ) ;
}
it1 + = NB_ITS_SPEED ;
@ -533,9 +528,8 @@ static void idct248_error(const char *name,
} while ( ti1 < 1000000 ) ;
mmx_emms ( ) ;
printf ( " %s %s: %0.1f kdct/s \n " ,
1 ? " IDCT248 " : " DCT248 " ,
name , ( double ) it1 * 1000.0 / ( double ) ti1 ) ;
printf ( " %s %s: %0.1f kdct/s \n " , 1 ? " IDCT248 " : " DCT248 " , name ,
( double ) it1 * 1000.0 / ( double ) ti1 ) ;
}
static void help ( void )
@ -545,56 +539,67 @@ static void help(void)
" 1 -> test with random sparse matrixes \n "
" 2 -> do 3. test from mpeg4 std \n "
" -i test IDCT implementations \n "
" -4 test IDCT248 implementations \n " ) ;
" -4 test IDCT248 implementations \n "
" -t speed test \n " ) ;
}
int main ( int argc , char * * argv )
{
int test_idct = 0 , test_248_dct = 0 ;
int c , i ;
int test = 1 ;
int c , i ;
int test = 1 ;
int speed = 0 ;
int err = 0 ;
int bits = 8 ;
cpu_flags = av_get_cpu_flags ( ) ;
ff_ref_dct_init ( ) ;
idct_mmx_init ( ) ;
for ( i = 0 ; i < 256 ; i + + ) cropTbl [ i + MAX_NEG_CROP ] = i ;
for ( i = 0 ; i < MAX_NEG_CROP ; i + + ) {
for ( i = 0 ; i < 256 ; i + + )
cropTbl [ i + MAX_NEG_CROP ] = i ;
for ( i = 0 ; i < MAX_NEG_CROP ; i + + ) {
cropTbl [ i ] = 0 ;
cropTbl [ i + MAX_NEG_CROP + 256 ] = 255 ;
}
for ( ; ; ) {
c = getopt ( argc , argv , " ih4 " ) ;
for ( ; ; ) {
c = getopt ( argc , argv , " ih4t " ) ;
if ( c = = - 1 )
break ;
switch ( c ) {
switch ( c ) {
case ' i ' :
test_idct = 1 ;
break ;
case ' 4 ' :
test_248_dct = 1 ;
break ;
default :
case ' t ' :
speed = 1 ;
break ;
default :
case ' h ' :
help ( ) ;
return 0 ;
}
}
if ( optind < argc ) test = atoi ( argv [ optind ] ) ;
if ( optind < argc )
test = atoi ( argv [ optind ] ) ;
if ( optind + 1 < argc ) bits = atoi ( argv [ optind + 1 ] ) ;
printf ( " ffmpeg DCT/IDCT test \n " ) ;
if ( test_248_dct ) {
idct248_error ( " SIMPLE-C " , ff_simple_idct248_put ) ;
idct248_error ( " SIMPLE-C " , ff_simple_idct248_put , speed ) ;
} else {
for ( i = 0 ; algos [ i ] . name ; i + + )
if ( algos [ i ] . is_idct = = test_idct & & ! ( ~ cpu_flags & algos [ i ] . mm_support ) ) {
dct_error ( algos [ i ] . name , algos [ i ] . is_idct , algos [ i ] . func , algos [ i ] . ref , algos [ i ] . format , test , bits ) ;
}
const struct algo * algos = test_idct ? idct_tab : fdct_tab ;
for ( i = 0 ; algos [ i ] . name ; i + + )
if ( ! ( ~ cpu_flags & algos [ i ] . mm_support ) ) {
err | = dct_error ( & algos [ i ] , test , test_idct , speed , bits ) ;
}
}
return 0 ;
return err ;
}