@ -30,12 +30,208 @@
# include "libavutil/mem_internal.h"
/* Helpers for building test tables. Each expands to one initializer holding
 * the stringified VC1DSPContext member name and that member's byte offset;
 * the SIZED variant additionally records a block width and height.
 * Both expansions end in a comma, so entries are listed back to back
 * without separators. */
# define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
# define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
/* One entry of a test table: identifies a function pointer inside
 * VC1DSPContext and, optionally, the block size it operates on. */
typedef struct {
    const char *name;   /* member name, used to build the check_func() label */
    size_t      offset; /* byte offset of the member within VC1DSPContext */
    int         width;  /* block width (left zero by VC1DSP_TEST) */
    int         height; /* block height (left zero by VC1DSP_TEST) */
} test;
/* Dense matrix of floats stored row-major in a flexible array member, so a
 * matrix and its elements live in a single allocation. */
typedef struct matrix {
    size_t width;   /* number of columns */
    size_t height;  /* number of rows */
    float  d[];     /* element (row, col) is d[row * width + col] */
} matrix;
/* 8x8 integer transform matrix, stored row-major. Selected by
 * generate_inverse_quantized_transform_coefficients() whenever the relevant
 * block dimension is 8. Every row has a squared norm of 4 * 288, 4 * 289 or
 * 4 * 292, which is where the divisors in normalise() come from. */
static const matrix T8 = { 8 , 8 , {
12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 ,
16 , 15 , 9 , 4 , - 4 , - 9 , - 15 , - 16 ,
16 , 6 , - 6 , - 16 , - 16 , - 6 , 6 , 16 ,
15 , - 4 , - 16 , - 9 , 9 , 16 , 4 , - 15 ,
12 , - 12 , - 12 , 12 , 12 , - 12 , - 12 , 12 ,
9 , - 16 , 4 , 15 , - 15 , - 4 , 16 , - 9 ,
6 , - 16 , 16 , - 6 , - 6 , 16 , - 16 , 6 ,
4 , - 9 , 15 , - 16 , 16 , - 15 , 9 , - 4
} } ;
/* 4x4 integer transform matrix, stored row-major. Selected by
 * generate_inverse_quantized_transform_coefficients() whenever the relevant
 * block dimension is 4. Rows have squared norms of 4 * 289 and 4 * 292
 * alternately. */
static const matrix T4 = { 4 , 4 , {
17 , 17 , 17 , 17 ,
22 , 10 , - 10 , - 22 ,
17 , - 17 , - 17 , 17 ,
10 , - 22 , 22 , - 10
} } ;
/* Transpose of T8, spelled out explicitly so that it can be passed straight
 * to multiply() without a transposition step. Must be kept in sync with T8. */
static const matrix T8t = { 8 , 8 , {
12 , 16 , 16 , 15 , 12 , 9 , 6 , 4 ,
12 , 15 , 6 , - 4 , - 12 , - 16 , - 16 , - 9 ,
12 , 9 , - 6 , - 16 , - 12 , 4 , 16 , 15 ,
12 , 4 , - 16 , - 9 , 12 , 15 , - 6 , - 16 ,
12 , - 4 , - 16 , 9 , 12 , - 15 , - 6 , 16 ,
12 , - 9 , - 6 , 16 , - 12 , - 4 , 16 , - 15 ,
12 , - 15 , 6 , 4 , - 12 , 16 , - 16 , 9 ,
12 , - 16 , 16 , - 15 , 12 , - 9 , 6 , - 4
} } ;
/* Transpose of T4, spelled out explicitly so that it can be passed straight
 * to multiply() without a transposition step. Must be kept in sync with T4. */
static const matrix T4t = { 4 , 4 , {
17 , 22 , 17 , 10 ,
17 , 10 , - 17 , - 22 ,
17 , - 10 , - 17 , 22 ,
17 , - 22 , 17 , - 10
} } ;
static matrix * new_matrix ( size_t width , size_t height )
{
matrix * out = av_mallocz ( sizeof ( matrix ) + height * width * sizeof ( float ) ) ;
if ( out = = NULL ) {
fprintf ( stderr , " Memory allocation failure \n " ) ;
exit ( EXIT_FAILURE ) ;
}
out - > width = width ;
out - > height = height ;
return out ;
}
static matrix * multiply ( const matrix * a , const matrix * b )
{
matrix * out ;
if ( a - > width ! = b - > height ) {
fprintf ( stderr , " Incompatible multiplication \n " ) ;
exit ( EXIT_FAILURE ) ;
}
out = new_matrix ( b - > width , a - > height ) ;
for ( int j = 0 ; j < out - > height ; + + j )
for ( int i = 0 ; i < out - > width ; + + i ) {
float sum = 0 ;
for ( int k = 0 ; k < a - > width ; + + k )
sum + = a - > d [ j * a - > width + k ] * b - > d [ k * b - > width + i ] ;
out - > d [ j * out - > width + i ] = sum ;
}
return out ;
}
/* Rescale a transformed matrix in place: every element is multiplied by 64
 * and then divided by one factor chosen by its row and one chosen by its
 * column. The factors are a quarter of the squared row norms of T4 / T8.
 * Only dimensions of 4 and 8 are meaningful; anything other than 4 is
 * treated as 8. */
static void normalise(matrix *a)
{
    static const unsigned norm4[] = { 289, 292, 289, 292 };
    static const unsigned norm8[] = { 288, 289, 292, 289, 288, 289, 292, 289 };
    const unsigned *by_row = a->height == 4 ? norm4 : norm8;
    const unsigned *by_col = a->width  == 4 ? norm4 : norm8;

    for (size_t row = 0; row < a->height; row++) {
        float *cell = a->d + row * a->width;

        for (size_t col = 0; col < a->width; col++, cell++) {
            *cell *= 64;
            *cell /= by_row[row];
            *cell /= by_col[col];
        }
    }
}
/* Divide every element of a by `by` and round the quotient to an integral
 * value with rintf() (i.e. using the current floating-point rounding mode).
 * Operates in place. */
static void divide_and_round_nearest(matrix *a, float by)
{
    const size_t count = a->height * a->width;

    for (size_t n = 0; n < count; n++)
        a->d[n] = rintf(a->d[n] / by);
}
/* Add 1 to every element from row 4 downwards, in place; matrices with four
 * rows or fewer are left untouched.
 * NOTE(review): presumably this mirrors a bias applied to the lower half of
 * the 8-point column transform before the final shift - confirm against the
 * VC-1 specification. */
static void tweak(matrix *a)
{
    const size_t end = a->height * a->width;

    /* Rows are contiguous, so rows 4.. form one run starting at 4 * width. */
    for (size_t n = 4 * a->width; n < end; n++)
        a->d[n] += 1;
}
/* The VC-1 spec places restrictions on the values permitted at three
 * different stages:
 * - D: the input coefficients in frequency domain
 * - E: the intermediate coefficients, inverse-transformed only horizontally
 * - R: the fully inverse-transformed coefficients
 *
 * To fully cater for the ranges specified requires various intermediate
 * values to be held to 17-bit precision; yet these conditions do not appear
 * to be utilised in real-world streams. At least some assembly
 * implementations have chosen to restrict these values to 16-bit precision,
 * to accelerate the decoding of real-world streams at the cost of strict
 * adherence to the spec. To avoid our test marking these as failures,
 * reduce our random inputs.
 *
 * ATTENUATION is the factor by which the range of the random inputs, and of
 * the limits checked at each of the D, E and R stages, is divided.
 */
# define ATTENUATION 4
static matrix * generate_inverse_quantized_transform_coefficients ( size_t width , size_t height )
{
matrix * raw , * tmp , * D , * E , * R ;
raw = new_matrix ( width , height ) ;
for ( int i = 0 ; i < width * height ; + + i )
raw - > d [ i ] = ( int ) ( rnd ( ) % ( 1024 / ATTENUATION ) ) - 512 / ATTENUATION ;
tmp = multiply ( height = = 8 ? & T8 : & T4 , raw ) ;
D = multiply ( tmp , width = = 8 ? & T8t : & T4t ) ;
normalise ( D ) ;
divide_and_round_nearest ( D , 1 ) ;
for ( int i = 0 ; i < width * height ; + + i ) {
if ( D - > d [ i ] < - 2048 / ATTENUATION | | D - > d [ i ] > 2048 / ATTENUATION - 1 ) {
/* Rare, so simply try again */
av_free ( raw ) ;
av_free ( tmp ) ;
av_free ( D ) ;
return generate_inverse_quantized_transform_coefficients ( width , height ) ;
}
}
E = multiply ( D , width = = 8 ? & T8 : & T4 ) ;
divide_and_round_nearest ( E , 8 ) ;
for ( int i = 0 ; i < width * height ; + + i )
if ( E - > d [ i ] < - 4096 / ATTENUATION | | E - > d [ i ] > 4096 / ATTENUATION - 1 ) {
/* Rare, so simply try again */
av_free ( raw ) ;
av_free ( tmp ) ;
av_free ( D ) ;
av_free ( E ) ;
return generate_inverse_quantized_transform_coefficients ( width , height ) ;
}
R = multiply ( height = = 8 ? & T8t : & T4t , E ) ;
tweak ( R ) ;
divide_and_round_nearest ( R , 128 ) ;
for ( int i = 0 ; i < width * height ; + + i )
if ( R - > d [ i ] < - 512 / ATTENUATION | | R - > d [ i ] > 512 / ATTENUATION - 1 ) {
/* Rare, so simply try again */
av_free ( raw ) ;
av_free ( tmp ) ;
av_free ( D ) ;
av_free ( E ) ;
av_free ( R ) ;
return generate_inverse_quantized_transform_coefficients ( width , height ) ;
}
av_free ( raw ) ;
av_free ( tmp ) ;
av_free ( E ) ;
av_free ( R ) ;
return D ;
}
/* Fill the first `size` 16-bit elements of the paired buffers name0 and
 * name1 with random values, writing the same value to both so that the
 * reference and the tested implementation start from identical data.
 * NOTE(review): `size` is expanded unparenthesised and the expansion declares
 * its own `i`, so pass simple expressions that do not mention `i`. */
# define RANDOMIZE_BUFFER16(name, size) \
do { \
int i ; \
for ( i = 0 ; i < size ; + + i ) { \
uint16_t r = rnd ( ) ; \
AV_WN16A ( name # # 0 + i , r ) ; \
AV_WN16A ( name # # 1 + i , r ) ; \
} \
} while ( 0 )
/* Fill the first `size` bytes of the paired buffers name0 and name1 with
 * random values, writing the same value to both so that the reference and
 * the tested implementation start from identical data.
 * NOTE(review): `size` is expanded unparenthesised and the expansion declares
 * its own `i`, so pass simple expressions that do not mention `i`. */
# define RANDOMIZE_BUFFER8(name, size) \
do { \
int i ; \
for ( i = 0 ; i < size ; + + i ) { \
uint8_t r = rnd ( ) ; \
name # # 0 [ i ] = r ; \
name # # 1 [ i ] = r ; \
} \
} while ( 0 )
# define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \
do { \
uint8_t * p # # 0 = name # # 0 , * p # # 1 = name # # 1 ; \
@ -49,6 +245,89 @@ typedef struct {
} \
} while ( 0 )
static void check_inv_trans_inplace ( void )
{
/* Inverse transform input coefficients are stored in a 16-bit buffer
* with row stride of 8 coefficients irrespective of transform size .
* vc1_inv_trans_8x8 differs from the others in two ways : coefficients
* are stored in column - major order , and the outputs are written back
* to the input buffer , so we oversize it slightly to catch overruns . */
LOCAL_ALIGNED_16 ( int16_t , inv_trans_in0 , [ 10 * 8 ] ) ;
LOCAL_ALIGNED_16 ( int16_t , inv_trans_in1 , [ 10 * 8 ] ) ;
VC1DSPContext h ;
ff_vc1dsp_init ( & h ) ;
if ( check_func ( h . vc1_inv_trans_8x8 , " vc1dsp.vc1_inv_trans_8x8 " ) ) {
matrix * coeffs ;
declare_func_emms ( AV_CPU_FLAG_MMX , void , int16_t * ) ;
RANDOMIZE_BUFFER16 ( inv_trans_in , 10 * 8 ) ;
coeffs = generate_inverse_quantized_transform_coefficients ( 8 , 8 ) ;
for ( int j = 0 ; j < 8 ; + + j )
for ( int i = 0 ; i < 8 ; + + i ) {
int idx = 8 + i * 8 + j ;
inv_trans_in1 [ idx ] = inv_trans_in0 [ idx ] = coeffs - > d [ j * 8 + i ] ;
}
call_ref ( inv_trans_in0 + 8 ) ;
call_new ( inv_trans_in1 + 8 ) ;
if ( memcmp ( inv_trans_in0 , inv_trans_in1 , 10 * 8 * sizeof ( int16_t ) ) )
fail ( ) ;
bench_new ( inv_trans_in1 + 8 ) ;
av_free ( coeffs ) ;
}
}
static void check_inv_trans_adding ( void )
{
/* Inverse transform input coefficients are stored in a 16-bit buffer
* with row stride of 8 coefficients irrespective of transform size . */
LOCAL_ALIGNED_16 ( int16_t , inv_trans_in0 , [ 8 * 8 ] ) ;
LOCAL_ALIGNED_16 ( int16_t , inv_trans_in1 , [ 8 * 8 ] ) ;
/* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
* added with saturation to an array of unsigned 8 - bit values . Oversize
* this by 8 samples left and right and one row above and below . */
LOCAL_ALIGNED_8 ( uint8_t , inv_trans_out0 , [ 10 * 24 ] ) ;
LOCAL_ALIGNED_8 ( uint8_t , inv_trans_out1 , [ 10 * 24 ] ) ;
VC1DSPContext h ;
const test tests [ ] = {
VC1DSP_SIZED_TEST ( vc1_inv_trans_8x4 , 8 , 4 )
VC1DSP_SIZED_TEST ( vc1_inv_trans_4x8 , 4 , 8 )
VC1DSP_SIZED_TEST ( vc1_inv_trans_4x4 , 4 , 4 )
VC1DSP_SIZED_TEST ( vc1_inv_trans_8x8_dc , 8 , 8 )
VC1DSP_SIZED_TEST ( vc1_inv_trans_8x4_dc , 8 , 4 )
VC1DSP_SIZED_TEST ( vc1_inv_trans_4x8_dc , 4 , 8 )
VC1DSP_SIZED_TEST ( vc1_inv_trans_4x4_dc , 4 , 4 )
} ;
ff_vc1dsp_init ( & h ) ;
for ( size_t t = 0 ; t < FF_ARRAY_ELEMS ( tests ) ; + + t ) {
void ( * func ) ( uint8_t * , ptrdiff_t , int16_t * ) = * ( void * * ) ( ( intptr_t ) & h + tests [ t ] . offset ) ;
if ( check_func ( func , " vc1dsp.%s " , tests [ t ] . name ) ) {
matrix * coeffs ;
declare_func_emms ( AV_CPU_FLAG_MMX , void , uint8_t * , ptrdiff_t , int16_t * ) ;
RANDOMIZE_BUFFER16 ( inv_trans_in , 8 * 8 ) ;
RANDOMIZE_BUFFER8 ( inv_trans_out , 10 * 24 ) ;
coeffs = generate_inverse_quantized_transform_coefficients ( tests [ t ] . width , tests [ t ] . height ) ;
for ( int j = 0 ; j < tests [ t ] . height ; + + j )
for ( int i = 0 ; i < tests [ t ] . width ; + + i ) {
int idx = j * 8 + i ;
inv_trans_in1 [ idx ] = inv_trans_in0 [ idx ] = coeffs - > d [ j * tests [ t ] . width + i ] ;
}
call_ref ( inv_trans_out0 + 24 + 8 , 24 , inv_trans_in0 ) ;
call_new ( inv_trans_out1 + 24 + 8 , 24 , inv_trans_in1 ) ;
if ( memcmp ( inv_trans_out0 , inv_trans_out1 , 10 * 24 ) )
fail ( ) ;
bench_new ( inv_trans_out1 + 24 + 8 , 24 , inv_trans_in1 + 8 ) ;
av_free ( coeffs ) ;
}
}
}
static void check_loop_filter ( void )
{
/* Deblocking filter buffers are big enough to hold a 16x16 block,
@ -97,6 +376,10 @@ static void check_loop_filter(void)
/* Entry point for the VC-1 DSP checks. Runs both inverse-transform checks
 * before a single report() call, so they are reported together, and then
 * runs and reports the loop-filter check separately. */
void checkasm_check_vc1dsp ( void )
{
check_inv_trans_inplace ( ) ;
check_inv_trans_adding ( ) ;
report ( " inv_trans " ) ;
check_loop_filter ( ) ;
report ( " loop_filter " ) ;
}