@ -75,7 +75,7 @@ static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
IDCT4_1D ( % % mm4 , % % mm2 , % % mm3 , % % mm0 , % % mm1 )
" pxor %%mm7, %%mm7 \n \t "
: : " m " ( ff_pw_32 ) ) ;
: : " m " ( * ff_pw_32 ) ) ;
asm volatile (
STORE_DIFF_4P ( % % mm0 , % % mm1 , % % mm7 )
@ -211,6 +211,93 @@ static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
add_pixels_clamped_mmx ( b2 , dst , stride ) ;
}
/* Emit asm that adds one row of 8 IDCT residuals to 8 destination pixels:
 *   p: xmm reg holding 8 signed 16-bit residuals (pre-scaled by 64)
 *   d: memory operand for the 8 destination bytes (read and written)
 *   t: scratch xmm reg (clobbered)
 *   z: xmm reg that must contain all-zeros (used for byte->word unpack)
 * Sequence: residuals >>= 6 (IDCT descale), widen dst bytes to words,
 * saturating add, pack back to unsigned bytes, store.
 * Fixed: mangled token spacing ("% % xmm…", spaces inside mnemonics) that
 * stringized into invalid assembly operands. */
#define STORE_DIFF_8P( p, d, t, z )\
        "movq       "#d", "#t" \n"\
        "psraw       $6,  "#p" \n"\
        "punpcklbw  "#z", "#t" \n"\
        "paddsw     "#t", "#p" \n"\
        "packuswb   "#p", "#p" \n"\
        "movq       "#p", "#d" \n"
/* One 1-D pass of the H.264 8x8 inverse transform over 8 rows held in the
 * xmm registers a..h (a..h = rows 0..7 on entry; all eight are clobbered).
 * Implements the butterfly network of the 8x8 IDCT: the first section
 * builds the odd-part intermediates with shift-by-1/2 approximations, then
 * rows 0 and 4 are reloaded from the block buffer (asm operand %1 — the
 * enclosing asm must bind %1 to the coefficient block) and combined with
 * the even part via SUMSUB_BA (defined elsewhere in this file).
 * NOTE(review): the output rows come out permuted across a..h — callers
 * must use the same register order the existing invocations use.
 * Fixed: mangled token spacing that stringized into invalid assembly. */
#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
        "movdqa     "#c", "#a" \n"\
        "movdqa     "#g", "#e" \n"\
        "psraw       $1,  "#c" \n"\
        "psraw       $1,  "#g" \n"\
        "psubw      "#e", "#c" \n"\
        "paddw      "#a", "#g" \n"\
        "movdqa     "#b", "#e" \n"\
        "psraw       $1,  "#e" \n"\
        "paddw      "#b", "#e" \n"\
        "paddw      "#d", "#e" \n"\
        "paddw      "#f", "#e" \n"\
        "movdqa     "#f", "#a" \n"\
        "psraw       $1,  "#a" \n"\
        "paddw      "#f", "#a" \n"\
        "paddw      "#h", "#a" \n"\
        "psubw      "#b", "#a" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#d", "#f" \n"\
        "paddw      "#h", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "psraw       $1,  "#d" \n"\
        "psraw       $1,  "#h" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "movdqa     "#e", "#d" \n"\
        "movdqa     "#a", "#h" \n"\
        "psraw       $2,  "#d" \n"\
        "psraw       $2,  "#h" \n"\
        "paddw      "#f", "#d" \n"\
        "paddw      "#b", "#h" \n"\
        "psraw       $2,  "#f" \n"\
        "psraw       $2,  "#b" \n"\
        "psubw      "#f", "#e" \n"\
        "psubw      "#a", "#b" \n"\
        "movdqa 0x00(%1), "#a" \n"\
        "movdqa 0x40(%1), "#f" \n"\
        SUMSUB_BA(f, a)\
        SUMSUB_BA(g, f)\
        SUMSUB_BA(c, a)\
        SUMSUB_BA(e, g)\
        SUMSUB_BA(b, c)\
        SUMSUB_BA(h, a)\
        SUMSUB_BA(d, f)
/**
 * 8x8 H.264 inverse transform + add, SSE2 version.
 * Computes the 2-D IDCT of the 8x8 coefficient block (two 1-D passes with a
 * transpose in between) and adds the descaled residual to dst with unsigned
 * saturation. The block buffer (%1) is used as scratch between passes.
 * Asm operands: %0 = dst (updated), %1 = block, %2 = stride, %3 = 3*stride,
 * %4 = ff_pw_32 (rounding constant added before the final >>6 descale).
 * Note the patch intent preserved here: the rounding operand is
 * "m"(*ff_pw_32) — dereferenced so the constraint refers to the constant
 * itself, not to the symbol's address.
 */
static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
{
    asm volatile(
        /* load rows 1..3 and 5..7; rows 0 and 4 are fetched inside the
           1-D macro itself */
        "movdqa   0x10(%1), %%xmm1 \n"
        "movdqa   0x20(%1), %%xmm2 \n"
        "movdqa   0x30(%1), %%xmm3 \n"
        "movdqa   0x50(%1), %%xmm5 \n"
        "movdqa   0x60(%1), %%xmm6 \n"
        "movdqa   0x70(%1), %%xmm7 \n"
        /* first (row) pass */
        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
        /* fold the +32 rounding bias into row 0 before the second pass */
        "paddw          %4, %%xmm4 \n"
        "movdqa     %%xmm4, 0x00(%1) \n"
        "movdqa     %%xmm2, 0x40(%1) \n"
        /* second (column) pass */
        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
        /* spill two result rows so xmm6/xmm7 can serve as scratch/zero */
        "movdqa     %%xmm6, 0x60(%1) \n"
        "movdqa     %%xmm7, 0x70(%1) \n"
        "pxor       %%xmm7, %%xmm7 \n"
        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
        "lea     (%0,%2,4), %0 \n"
        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
        /* reload the two spilled rows and store them last */
        "movdqa   0x60(%1), %%xmm0 \n"
        "movdqa   0x70(%1), %%xmm1 \n"
        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
        :"+r"(dst)
        :"r"(block), "r"((long)stride), "r"(3L*stride), "m"(*ff_pw_32)
    );
}
static void ff_h264_idct_dc_add_mmx2 ( uint8_t * dst , int16_t * block , int stride )
{
int dc = ( block [ 0 ] + 32 ) > > 6 ;
@ -839,7 +926,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
" decl %2 \n \t " \
" jnz 1b \n \t " \
: " +a " ( tmp ) , " +c " ( dst ) , " +m " ( h ) \
: " S " ( ( long ) dstStride ) , " m " ( ff_pw_32 ) \
: " S " ( ( long ) dstStride ) , " m " ( * ff_pw_32 ) \
: " memory " \
) ; \
} \
@ -1113,7 +1200,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst
" decl %2 \n \t " \
" jnz 1b \n \t " \
: " +a " ( tmp ) , " +c " ( dst ) , " +m " ( h ) \
: " S " ( ( long ) dstStride ) , " m " ( ff_pw_32 ) \
: " S " ( ( long ) dstStride ) , " m " ( * ff_pw_32 ) \
: " memory " \
) ; \
tmp + = 8 - size * 24 ; \