@@ -65,6 +65,9 @@ static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA
static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
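/* packed-double constants used by the SSE2 FLAC autocorrelation code below */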
static const double ff_pd_1[2] attribute_used __attribute__ ((aligned(16))) = { 1.0, 1.0 };
static const double ff_pd_2[2] attribute_used __attribute__ ((aligned(16))) = { 2.0, 2.0 };
#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
@@ -2958,6 +2961,125 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
}
}
#ifdef CONFIG_ENCODERS
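/* Apply a Welch (parabolic) window to len int32 samples and store the result
 * as doubles in w_data. The two index registers walk the block from both ends
 * toward the middle, producing two windowed samples per half per iteration;
 * the unaligned-store variant of the loop is selected when len is odd. */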
static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
{
    double c = 2.0 / (len-1.0);
    int n2 = len>>1;
    long i = -n2*sizeof(int32_t);
    long j =  n2*sizeof(int32_t);
    asm volatile(
        "movsd   %0,     %%xmm7 \n\t"
        "movapd  %1,     %%xmm6 \n\t"
        "movapd  %2,     %%xmm5 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "subpd   %%xmm5, %%xmm7 \n\t"
        "addsd   %%xmm6, %%xmm7 \n\t"
        ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2)
    );
#define WELCH(MOVPD)\
    asm volatile(\
        "1:                               \n\t"\
        "movapd   %%xmm7,  %%xmm1         \n\t"\
        "mulpd    %%xmm1,  %%xmm1         \n\t"\
        "movapd   %%xmm6,  %%xmm0         \n\t"\
        "subpd    %%xmm1,  %%xmm0         \n\t"\
        "pshufd   $0x4e,   %%xmm0, %%xmm1 \n\t"\
        "cvtpi2pd (%4,%0), %%xmm2         \n\t"\
        "cvtpi2pd (%5,%1), %%xmm3         \n\t"\
        "mulpd    %%xmm0,  %%xmm2         \n\t"\
        "mulpd    %%xmm1,  %%xmm3         \n\t"\
        "movapd   %%xmm2, (%2,%0,2)       \n\t"\
        MOVPD"    %%xmm3, (%3,%1,2)       \n\t"\
        "subpd    %%xmm5,  %%xmm7         \n\t"\
        "sub      $8,      %1             \n\t"\
        "add      $8,      %0             \n\t"\
        "jl 1b                            \n\t"\
        :"+&r"(i), "+&r"(j)\
        :"r"(w_data+n2), "r"(w_data+len-2-n2),\
         "r"(data+n2),   "r"(data+len-2-n2)\
    );
    if(len&1)
        WELCH("movupd")
    else
        WELCH("movapd")
#undef WELCH
}
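/* Compute the windowed autocorrelation used by the FLAC encoder's LPC
 * analysis. The input is Welch-windowed into data1[] (with data1[-lag..-1]
 * and data1[len] zeroed); each pass of the j-loop then accumulates two lags
 * at once, and a three-lag pass is used when j reaches lag-2, so that
 * autoc[0..lag] are all produced. As a rough scalar sketch of what the SIMD
 * loops accumulate (illustration only, not part of the patch):
 *
 *     for (k = 0; k <= lag; k++) {
 *         double sum = 1.0;               // each accumulator starts at 1.0
 *         for (i = 0; i < len; i++)
 *             sum += data1[i] * data1[i - k];
 *         autoc[k] = sum;
 *     }
 */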
static void flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                       double *autoc)
{
    double tmp[len + lag + 2];
    double *data1 = tmp + lag;
    int j;

    if((long)data1 & 15)
        data1++;

    apply_welch_window_sse2(data, len, data1);

    for(j=0; j<lag; j++)
        data1[j-lag]= 0.0;
    data1[len] = 0.0;

    for(j=0; j<lag; j+=2){
        long i = -len*sizeof(double);
        if(j == lag-2) {
            asm volatile(
                "movsd     %6,      %%xmm0 \n\t"
                "movsd     %6,      %%xmm1 \n\t"
                "movsd     %6,      %%xmm2 \n\t"
                "1:                        \n\t"
                "movapd   (%4,%0),  %%xmm3 \n\t"
                "movupd -8(%5,%0),  %%xmm4 \n\t"
                "movapd   (%5,%0),  %%xmm5 \n\t"
                "mulpd     %%xmm3,  %%xmm4 \n\t"
                "mulpd     %%xmm3,  %%xmm5 \n\t"
                "mulpd -16(%5,%0),  %%xmm3 \n\t"
                "addpd     %%xmm4,  %%xmm1 \n\t"
                "addpd     %%xmm5,  %%xmm0 \n\t"
                "addpd     %%xmm3,  %%xmm2 \n\t"
                "add       $16,     %0     \n\t"
                "jl 1b                     \n\t"
                "movhlps   %%xmm0,  %%xmm3 \n\t"
                "movhlps   %%xmm1,  %%xmm4 \n\t"
                "movhlps   %%xmm2,  %%xmm5 \n\t"
                "addsd     %%xmm3,  %%xmm0 \n\t"
                "addsd     %%xmm4,  %%xmm1 \n\t"
                "addsd     %%xmm5,  %%xmm2 \n\t"
                "movsd     %%xmm0,  %1     \n\t"
                "movsd     %%xmm1,  %2     \n\t"
                "movsd     %%xmm2,  %3     \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
                :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
            );
        } else {
            asm volatile(
                "movsd     %5,      %%xmm0 \n\t"
                "movsd     %5,      %%xmm1 \n\t"
                "1:                        \n\t"
                "movapd   (%3,%0),  %%xmm3 \n\t"
                "movupd -8(%4,%0),  %%xmm4 \n\t"
                "mulpd     %%xmm3,  %%xmm4 \n\t"
                "mulpd    (%4,%0),  %%xmm3 \n\t"
                "addpd     %%xmm4,  %%xmm1 \n\t"
                "addpd     %%xmm3,  %%xmm0 \n\t"
                "add       $16,     %0     \n\t"
                "jl 1b                     \n\t"
                "movhlps   %%xmm0,  %%xmm3 \n\t"
                "movhlps   %%xmm1,  %%xmm4 \n\t"
                "addsd     %%xmm3,  %%xmm0 \n\t"
                "addsd     %%xmm4,  %%xmm1 \n\t"
                "movsd     %%xmm0,  %1     \n\t"
                "movsd     %%xmm1,  %2     \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
            );
        }
    }
}
#endif // CONFIG_ENCODERS
static void vector_fmul_3dnow(float *dst, const float *src, int len) {
    long i = (len-4)*4;
    asm volatile(
@@ -3605,6 +3727,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
        c->hadamard8_diff[0] = hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = hadamard8_diff_sse2;
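        /* install the SSE2 FLAC autocorrelation added above */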
        c->flac_compute_autocorr = flac_compute_autocorr_sse2;
}
#ifdef HAVE_SSSE3