@ -35,40 +35,140 @@
AV_WN32 ( buf + j , rnd ( ) ) ; \
} while ( 0 )
// This reference function is the same approximate algorithm employed by the
// SIMD functions
// Each output byte starts from a 16-bit accumulator seeded with
// ((filterSize - 1) * 8 + dither[0]) >> 4, adds one 16-bit half of every
// src * filter product, and is finally shifted right by 3 and clipped.
static void ref_function(const int16_t *filter, int filterSize,
                         const int16_t **src, uint8_t *dest, int dstW,
                         const uint8_t *dither, int offset)
{
    int i, d;

    d = ((filterSize - 1) * 8 + dither[0]) >> 4;
    for (i = 0; i < dstW; i++) {
        int16_t val = d;
        int j;
        union {
            int val;
            int16_t v[2];
        } t;

        for (j = 0; j < filterSize; j++) {
            t.val = (int)src[j][i + offset] * (int)filter[j];
            // NOTE(review): v[1] is the upper half of the product only on
            // little-endian targets with a 32-bit int - confirm before
            // running this reference elsewhere.
            val += t.v[1];
        }
        dest[i] = av_clip_uint8(val >> 3);
    }
}

// Full-precision reference: every output byte is the dither value for its
// column (shifted left by 12) plus the sum of src[j][i] * filter[j] over all
// filter taps, shifted right by 19 and clipped to 8 bits.
static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    // This corresponds to the yuv2planeX_8_c function
    int i;

    for (i = 0; i < dstW; i++) {
        // offset only selects the dither phase; src is indexed by i alone
        int val = dither[(i + offset) & 7] << 12;
        int j;

        for (j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];

        dest[i] = av_clip_uint8(val >> 19);
    }
}
// Compares the first n bytes of ref and test with a tolerance.
// Returns 1 as soon as a pair of bytes differs by more than accuracy,
// and 0 if every pair is within accuracy (including when n is 0).
static int cmp_off_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy)
{
    for (size_t i = 0; i < n; i++) {
        // uint8_t operands promote to int, so the difference cannot wrap
        if (abs(ref[i] - test[i]) > accuracy)
            return 1;
    }
    return 0;
}
// Hex-dumps len bytes starting at p to stdout, eight bytes per row.
// Each row is prefixed with the index of its first byte plus offset, so the
// caller can make the labels match positions in a larger buffer.
static void print_data(uint8_t *p, size_t len, size_t offset)
{
    size_t i = 0;

    for (; i < len; i++) {
        if (i % 8 == 0) {
            printf(" 0x%04zx: ", i + offset);
        }
        // %x takes an unsigned int
        printf(" 0x%02x ", (unsigned int) p[i]);
        if (i % 8 == 7) {
            printf(" \n ");
        }
    }
    // Terminate a last row that holds fewer than eight bytes
    if (i % 8 != 0) {
        printf(" \n ");
    }
}
// Locates the first index at which a and b differ, hex-dumps up to 32 bytes
// of both buffers around it, and returns that index.
// Returns len when the first len bytes of a and b are identical.
static size_t show_differences(uint8_t *a, uint8_t *b, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (a[i] != b[i]) {
            // Begin the dump on the 8-byte row containing i - 8, or on row 0
            // when the mismatch lies within the first eight bytes
            size_t offset = (i >= 8 ? i - 8 : i) & ~(size_t)7;
            // Clamp the dump so it never reads beyond the len bytes given
            size_t count  = len - offset < 32 ? len - offset : 32;

            printf(" test a: \n ");
            print_data(&a[offset], count, offset);
            printf(" \n test b: \n ");
            print_data(&b[offset], count, offset);
            printf(" \n ");
            return i;
        }
    }
    return len;
}
static void check_yuv2yuvX ( void )
static void check_yuv2yuv1 ( int accurate )
{
struct SwsContext * ctx ;
int osi , isi ;
int dstW , offset ;
size_t fail_offset ;
const int input_sizes [ ] = { 8 , 24 , 128 , 144 , 256 , 512 } ;
const int INPUT_SIZES = sizeof ( input_sizes ) / sizeof ( input_sizes [ 0 ] ) ;
# define LARGEST_INPUT_SIZE 512
const int offsets [ ] = { 0 , 3 , 8 , 11 , 16 , 19 } ;
const int OFFSET_SIZES = sizeof ( offsets ) / sizeof ( offsets [ 0 ] ) ;
const char * accurate_str = ( accurate ) ? " accurate " : " approximate " ;
declare_func_emms ( AV_CPU_FLAG_MMX , void ,
const int16_t * src , uint8_t * dest ,
int dstW , const uint8_t * dither , int offset ) ;
LOCAL_ALIGNED_8 ( int16_t , src_pixels , [ LARGEST_INPUT_SIZE ] ) ;
LOCAL_ALIGNED_8 ( uint8_t , dst0 , [ LARGEST_INPUT_SIZE ] ) ;
LOCAL_ALIGNED_8 ( uint8_t , dst1 , [ LARGEST_INPUT_SIZE ] ) ;
LOCAL_ALIGNED_8 ( uint8_t , dither , [ 8 ] ) ;
randomize_buffers ( ( uint8_t * ) dither , 8 ) ;
randomize_buffers ( ( uint8_t * ) src_pixels , LARGEST_INPUT_SIZE * sizeof ( int16_t ) ) ;
ctx = sws_alloc_context ( ) ;
if ( accurate )
ctx - > flags | = SWS_ACCURATE_RND ;
if ( sws_init_context ( ctx , NULL , NULL ) < 0 )
fail ( ) ;
ff_sws_init_scale ( ctx ) ;
for ( isi = 0 ; isi < INPUT_SIZES ; + + isi ) {
dstW = input_sizes [ isi ] ;
for ( osi = 0 ; osi < OFFSET_SIZES ; osi + + ) {
offset = offsets [ osi ] ;
if ( check_func ( ctx - > yuv2plane1 , " yuv2yuv1_%d_%d_%s " , offset , dstW , accurate_str ) ) {
memset ( dst0 , 0 , LARGEST_INPUT_SIZE * sizeof ( dst0 [ 0 ] ) ) ;
memset ( dst1 , 0 , LARGEST_INPUT_SIZE * sizeof ( dst1 [ 0 ] ) ) ;
call_ref ( src_pixels , dst0 , dstW , dither , offset ) ;
call_new ( src_pixels , dst1 , dstW , dither , offset ) ;
if ( cmp_off_by_n ( dst0 , dst1 , dstW * sizeof ( dst0 [ 0 ] ) , accurate ? 0 : 2 ) ) {
fail ( ) ;
printf ( " failed: yuv2yuv1_%d_%di_%s \n " , offset , dstW , accurate_str ) ;
fail_offset = show_differences ( dst0 , dst1 , LARGEST_INPUT_SIZE * sizeof ( dst0 [ 0 ] ) ) ;
printf ( " failing values: src: 0x%04x dither: 0x%02x dst-c: %02x dst-asm: %02x \n " ,
( int ) src_pixels [ fail_offset ] ,
( int ) dither [ ( fail_offset + fail_offset ) & 7 ] ,
( int ) dst0 [ fail_offset ] ,
( int ) dst1 [ fail_offset ] ) ;
}
if ( dstW = = LARGEST_INPUT_SIZE )
bench_new ( src_pixels , dst1 , dstW , dither , offset ) ;
}
}
}
sws_freeContext ( ctx ) ;
}
static void check_yuv2yuvX ( int accurate )
{
struct SwsContext * ctx ;
int fsi , osi , isi , i , j ;
int dstW ;
# define LARGEST_FILTER 16
# define FILTER_SIZES 4
static const int filter_sizes [ FILTER_SIZES ] = { 1 , 4 , 8 , 16 } ;
// ff_yuv2planeX_8_sse2 can't handle odd filter sizes
const int filter_sizes [ ] = { 2 , 4 , 8 , 16 } ;
const int FILTER_SIZES = sizeof ( filter_sizes ) / sizeof ( filter_sizes [ 0 ] ) ;
# define LARGEST_INPUT_SIZE 512
# define INPUT_SIZES 6
static const int input_sizes [ INPUT_SIZES ] = { 8 , 24 , 128 , 144 , 256 , 512 } ;
static const int input_sizes [ ] = { 8 , 24 , 128 , 144 , 256 , 512 } ;
const int INPUT_SIZES = sizeof ( input_sizes ) / sizeof ( input_sizes [ 0 ] ) ;
const char * accurate_str = ( accurate ) ? " accurate " : " approximate " ;
declare_func_emms ( AV_CPU_FLAG_MMX , void , const int16_t * filter ,
int filterSize , const int16_t * * src , uint8_t * dest ,
@ -89,6 +189,8 @@ static void check_yuv2yuvX(void)
randomize_buffers ( ( uint8_t * ) src_pixels , LARGEST_FILTER * LARGEST_INPUT_SIZE * sizeof ( int16_t ) ) ;
randomize_buffers ( ( uint8_t * ) filter_coeff , LARGEST_FILTER * sizeof ( int16_t ) ) ;
ctx = sws_alloc_context ( ) ;
if ( accurate )
ctx - > flags | = SWS_ACCURATE_RND ;
if ( sws_init_context ( ctx , NULL , NULL ) < 0 )
fail ( ) ;
@ -96,33 +198,37 @@ static void check_yuv2yuvX(void)
for ( isi = 0 ; isi < INPUT_SIZES ; + + isi ) {
dstW = input_sizes [ isi ] ;
for ( osi = 0 ; osi < 64 ; osi + = 16 ) {
for ( fsi = 0 ; fsi < FILTER_SIZES ; + + fsi ) {
if ( dstW < = osi )
continue ;
for ( fsi = 0 ; fsi < FILTER_SIZES ; + + fsi ) {
src = av_malloc ( sizeof ( int16_t * ) * filter_sizes [ fsi ] ) ;
vFilterData = av_malloc ( ( filter_sizes [ fsi ] + 2 ) * sizeof ( union VFilterData ) ) ;
memset ( vFilterData , 0 , ( filter_sizes [ fsi ] + 2 ) * sizeof ( union VFilterData ) ) ;
for ( i = 0 ; i < filter_sizes [ fsi ] ; + + i ) {
for ( i = 0 ; i < filter_sizes [ fsi ] ; + + i ) {
src [ i ] = & src_pixels [ i * LARGEST_INPUT_SIZE ] ;
vFilterData [ i ] . src = src [ i ] ;
vFilterData [ i ] . src = src [ i ] - osi ;
for ( j = 0 ; j < 4 ; + + j )
vFilterData [ i ] . coeff [ j + 4 ] = filter_coeff [ i ] ;
}
if ( check_func ( ctx - > yuv2planeX , " yuv2yuvX_%d_%d_%d " , filter_sizes [ fsi ] , osi , dstW ) ) {
if ( check_func ( ctx - > yuv2planeX , " yuv2yuvX_%d_%d_%d_%s " , filter_sizes [ fsi ] , osi , dstW , accurate_str ) ) {
// use vFilterData for the mmx function
const int16_t * filter = ctx - > use_mmx_vfilter ? ( const int16_t * ) vFilterData : & filter_coeff [ 0 ] ;
memset ( dst0 , 0 , LARGEST_INPUT_SIZE * sizeof ( dst0 [ 0 ] ) ) ;
memset ( dst1 , 0 , LARGEST_INPUT_SIZE * sizeof ( dst1 [ 0 ] ) ) ;
// The reference function is not the scalar function selected when mmx
// is deactivated as the SIMD functions do not give the same result as
// the scalar ones due to rounding. The SIMD functions are activated by
// the flag SWS_ACCURATE_RND
ref_function ( & filter_coeff [ 0 ] , filter_sizes [ fsi ] , src , dst0 , dstW - osi , dither , osi ) ;
// There's no point in calling new for the reference function
if ( ctx - > use_mmx_vfilter ) {
call_new ( ( const int16_t * ) vFilterData , filter_sizes [ fsi ] , src , dst1 , dstW - osi , dither , osi ) ;
if ( memcmp ( dst0 , dst1 , LARGEST_INPUT_SIZE * sizeof ( dst0 [ 0 ] ) ) )
fail ( ) ;
if ( dstW = = LARGEST_INPUT_SIZE )
bench_new ( ( const int16_t * ) vFilterData , filter_sizes [ fsi ] , src , dst1 , dstW - osi , dither , osi ) ;
// We can't use call_ref here, because we don't know if use_mmx_vfilter was set for that
// function or not, so we can't pass it the parameters correctly.
yuv2planeX_8_ref ( & filter_coeff [ 0 ] , filter_sizes [ fsi ] , src , dst0 , dstW - osi , dither , osi ) ;
call_new ( filter , filter_sizes [ fsi ] , src , dst1 , dstW - osi , dither , osi ) ;
if ( cmp_off_by_n ( dst0 , dst1 , LARGEST_INPUT_SIZE * sizeof ( dst0 [ 0 ] ) , accurate ? 0 : 2 ) ) {
fail ( ) ;
printf ( " failed: yuv2yuvX_%d_%d_%d_%s \n " , filter_sizes [ fsi ] , osi , dstW , accurate_str ) ;
show_differences ( dst0 , dst1 , LARGEST_INPUT_SIZE * sizeof ( dst0 [ 0 ] ) ) ;
}
if ( dstW = = LARGEST_INPUT_SIZE )
bench_new ( ( const int16_t * ) vFilterData , filter_sizes [ fsi ] , src , dst1 , dstW - osi , dither , osi ) ;
}
av_freep ( & src ) ;
av_freep ( & vFilterData ) ;
@ -245,6 +351,10 @@ void checkasm_check_sw_scale(void)
{
check_hscale ( ) ;
report ( " hscale " ) ;
check_yuv2yuvX ( ) ;
check_yuv2yuv1 ( 0 ) ;
check_yuv2yuv1 ( 1 ) ;
report ( " yuv2yuv1 " ) ;
check_yuv2yuvX ( 0 ) ;
check_yuv2yuvX ( 1 ) ;
report ( " yuv2yuvX " ) ;
}