@@ -721,38 +721,44 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
/***********************************/
/* motion compensation */
#define QPEL_H264V(A,B,C,D,E,F,OP)\
        "movd (%0), "#F" \n\t"\
        "movq "#C", %%mm6 \n\t"\
        "paddw "#D", %%mm6 \n\t"\
        "psllw $2, %%mm6 \n\t"\
        "psubw "#B", %%mm6 \n\t"\
        "psubw "#E", %%mm6 \n\t"\
        "pmullw %4, %%mm6 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm7, "#F" \n\t"\
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
        "mov"#q" "#C", "#T" \n\t"\
        "mov"#d" (%0), "#F" \n\t"\
        "paddw "#D", "#T" \n\t"\
        "psllw $2, "#T" \n\t"\
        "psubw "#B", "#T" \n\t"\
        "psubw "#E", "#T" \n\t"\
        "punpcklbw "#Z", "#F" \n\t"\
        "pmullw %4, "#T" \n\t"\
        "paddw %5, "#A" \n\t"\
        "add %2, %0 \n\t"\
        "paddw "#F", "#A" \n\t"\
        "paddw "#A", %%mm6 \n\t"\
        "psraw $5, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm6 \n\t"\
        OP(%%mm6, (%1), A, d)\
        "paddw "#A", "#T" \n\t"\
        "psraw $5, "#T" \n\t"\
        "packuswb "#T", "#T" \n\t"\
        OP(T, (%1), A, d)\
        "add %3, %1 \n\t"
#define QPEL_H264HV(A,B,C,D,E,F,OF)\
        "movd (%0), "#F" \n\t"\
        "movq "#C", %%mm6 \n\t"\
        "paddw "#D", %%mm6 \n\t"\
        "psllw $2, %%mm6 \n\t"\
        "psubw "#B", %%mm6 \n\t"\
        "psubw "#E", %%mm6 \n\t"\
        "pmullw %3, %%mm6 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm7, "#F" \n\t"\
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
        "mov"#q" "#C", "#T" \n\t"\
        "mov"#d" (%0), "#F" \n\t"\
        "paddw "#D", "#T" \n\t"\
        "psllw $2, "#T" \n\t"\
        "paddw %4, "#A" \n\t"\
        "psubw "#B", "#T" \n\t"\
        "psubw "#E", "#T" \n\t"\
        "punpcklbw "#Z", "#F" \n\t"\
        "pmullw %3, "#T" \n\t"\
        "paddw "#F", "#A" \n\t"\
        "paddw "#A", %%mm6 \n\t"\
        "movq %%mm6, "#OF"(%1) \n\t"
        "add %2, %0 \n\t"\
        "paddw "#A", "#T" \n\t"\
        "mov"#q" "#T", "#OF"(%1) \n\t"
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
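/* Both QPEL_H264*_MM variants emit one step of the H.264 six-tap luma filter
 * (1,-5,20,20,-5,1), parameterized over the accumulator register T, the zero
 * register Z and the load/copy widths d/q, so the same text expands to the
 * MMX (mm6/mm7, movd/movq) and SSE2 (xmm6/xmm7, movq/movdqa) versions.  The
 * vertical variant packs and stores pixels through OP; the HV variant keeps
 * the unscaled 16-bit sums at offset OF in a temp buffer with a 48-byte row
 * stride.  Rough scalar sketch of the vertical path (illustration only):
 *   dst[x] = clip_uint8((20*(C+D) - 5*(B+E) + A + F + 16) >> 5)
 */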
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -1270,21 +1276,29 @@ static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    while(h--){\
    do{\
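        /* unrolled by two: each pass shifts two rows of 16-bit input right */\
        /* by 5 (row stride 48 bytes = 24 int16s), packs them to bytes and */\
        /* pavgb-averages them with the matching src8 rows */\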
        asm volatile(\
            "movq (%1), %%mm0 \n\t"\
            "movq 8(%1), %%mm1 \n\t"\
            "movq 48(%1), %%mm2 \n\t"\
            "movq 8+48(%1), %%mm3 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "psraw $5, %%mm1 \n\t"\
            "psraw $5, %%mm2 \n\t"\
            "psraw $5, %%mm3 \n\t"\
            "packuswb %%mm1, %%mm0 \n\t"\
            "packuswb %%mm3, %%mm2 \n\t"\
            PAVGB" (%0), %%mm0 \n\t"\
            PAVGB" (%0,%3), %%mm2 \n\t"\
            OP(%%mm0, (%2), %%mm5, q)\
            :: "a"(src8), "c"(src16), "d"(dst)\
            OP(%%mm2, (%2,%4), %%mm5, q)\
            :: "a"(src8), "c"(src16), "d"(dst),\
               "r"((long)src8Stride), "r"((long)dstStride)\
            : "memory");\
        src8 += src8Stride;\
        src16 += 24;\
        dst += dstStride;\
    }\
        src8 += 2L*src8Stride;\
        src16 += 48;\
        dst += 2L*dstStride;\
    }while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
@@ -1292,8 +1306,364 @@ static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, u
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\
#ifdef ARCH_X86_64
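// On x86_64 the extra xmm8-xmm15 registers let the 16-wide horizontal+l2
// filter keep a whole row (both 8-pixel halves) in registers per iteration;
// the 32-bit fallback below simply makes four calls to the 8-wide version.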
#define QPEL_H264_HL2_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h = 16;\
    asm volatile(\
        "pxor %%xmm15, %%xmm15 \n\t"\
        "movdqa %6, %%xmm14 \n\t"\
        "movdqa %7, %%xmm13 \n\t"\
        "1: \n\t"\
        "lddqu 3(%0), %%xmm1 \n\t"\
        "lddqu -5(%0), %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm15, %%xmm1 \n\t"\
        "punpcklbw %%xmm15, %%xmm0 \n\t"\
        "punpcklbw %%xmm15, %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm0, %%xmm6 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm0, %%xmm8 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm0, %%xmm9 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "movdqa %%xmm0, %%xmm10 \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $6, %%xmm7, %%xmm10 \n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $8, %%xmm7, %%xmm9 \n\t"\
        "palignr $10, %%xmm0, %%xmm3 \n\t"\
        "palignr $10, %%xmm7, %%xmm8 \n\t"\
        "paddw %%xmm1, %%xmm5 \n\t"\
        "paddw %%xmm0, %%xmm10 \n\t"\
        "palignr $12, %%xmm0, %%xmm2 \n\t"\
        "palignr $12, %%xmm7, %%xmm6 \n\t"\
        "palignr $14, %%xmm0, %%xmm1 \n\t"\
        "palignr $14, %%xmm7, %%xmm0 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm8, %%xmm6 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "paddw %%xmm9, %%xmm0 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "psllw $2, %%xmm6 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "psubw %%xmm0, %%xmm6 \n\t"\
        "paddw %%xmm13, %%xmm5 \n\t"\
        "paddw %%xmm13, %%xmm10 \n\t"\
        "pmullw %%xmm14, %%xmm2 \n\t"\
        "pmullw %%xmm14, %%xmm6 \n\t"\
        "lddqu (%2), %%xmm3 \n\t"\
        "paddw %%xmm5, %%xmm2 \n\t"\
        "paddw %%xmm10, %%xmm6 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "psraw $5, %%xmm6 \n\t"\
        "packuswb %%xmm2, %%xmm6 \n\t"\
        "pavgb %%xmm3, %%xmm6 \n\t"\
        OP(%%xmm6, (%1), %%xmm4, dqa)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "dec %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((long)src2Stride), "S"((long)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}
#else // ARCH_X86_64
#define QPEL_H264_HL2_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst, src, src2, dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst, src, src2, dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
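// The SSSE3 horizontal filters load each 16-byte source span once with lddqu
// and then use palignr to synthesize the shifted copies needed for the
// six-tap kernel, instead of re-reading src at several unaligned offsets.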
#define QPEL_H264_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h = 8;\
    asm volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movdqa %0, %%xmm6 \n\t"\
        :: "m"(ff_pw_5)\
    );\
    do{\
    asm volatile(\
        "lddqu -5(%0), %%xmm1 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $10, %%xmm0, %%xmm3 \n\t"\
        "paddw %%xmm1, %%xmm5 \n\t"\
        "palignr $12, %%xmm0, %%xmm2 \n\t"\
        "palignr $14, %%xmm0, %%xmm1 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "movq (%2), %%xmm3 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "paddw %5, %%xmm5 \n\t"\
        "pmullw %%xmm6, %%xmm2 \n\t"\
        "paddw %%xmm5, %%xmm2 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "packuswb %%xmm2, %%xmm2 \n\t"\
        "pavgb %%xmm3, %%xmm2 \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %4, %0 \n\t"\
        "add %4, %1 \n\t"\
        "add %3, %2 \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2)\
        : "D"((long)src2Stride), "S"((long)dstStride),\
          "m"(ff_pw_16)\
        : "memory"\
    );\
    }while(--h);\
}\
QPEL_H264_HL2_XMM(OPNAME, OP, MMX)\
\
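/* vertical 8/16-tall lowpass: preload five rows widened to 16-bit words, then */\
/* each QPEL_H264V_XMM step reads one more row and emits one output row while */\
/* rotating the six row registers */\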
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    \
    asm volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movq (%0), %%xmm0 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm1 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm2 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm3 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm4 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm2 \n\t"\
        "punpcklbw %%xmm7, %%xmm3 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
    if(h == 16){\
        asm volatile(\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
            : "memory"\
        );\
    }\
}\
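/* 2-D (hv) lowpass: pass 1 runs the vertical six-tap over column strips of 8 */\
/* into a 16-bit tmp buffer (48-byte row stride); pass 2 below runs the */\
/* horizontal six-tap over tmp and scales the result back to pixels */\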
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    int h = size;\
    int w = (size+8)>>3;\
    src -= 2*srcStride+2;\
    while(w--){\
        asm volatile(\
            "pxor %%xmm7, %%xmm7 \n\t"\
            "movq (%0), %%xmm0 \n\t"\
            "add %2, %0 \n\t"\
            "movq (%0), %%xmm1 \n\t"\
            "add %2, %0 \n\t"\
            "movq (%0), %%xmm2 \n\t"\
            "add %2, %0 \n\t"\
            "movq (%0), %%xmm3 \n\t"\
            "add %2, %0 \n\t"\
            "movq (%0), %%xmm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%xmm7, %%xmm0 \n\t"\
            "punpcklbw %%xmm7, %%xmm1 \n\t"\
            "punpcklbw %%xmm7, %%xmm2 \n\t"\
            "punpcklbw %%xmm7, %%xmm3 \n\t"\
            "punpcklbw %%xmm7, %%xmm4 \n\t"\
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)\
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)\
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)\
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)\
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)\
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)\
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)\
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)\
            : "+a"(src)\
            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
            : "memory"\
        );\
        if(size == 16){\
            asm volatile(\
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)\
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)\
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)\
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)\
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)\
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)\
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)\
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)\
                : "+a"(src)\
                : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
                : "memory"\
            );\
        }\
        tmp += 8;\
        src += 8 - (size+5)*srcStride;\
    }\
    tmp -= size+8;\
    h = size;\
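    /* pass 2: apply the horizontal six-tap to the 16-bit intermediates in */\
    /* tmp; the staggered ">>2 ... >>2 ... >>6" sequence keeps the weighted */\
    /* sum within 16-bit range before the final pack back to pixels */\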
    if(size == 16){\
        asm volatile(\
            "1: \n\t"\
            "movdqa 32(%0), %%xmm4 \n\t"\
            "movdqa 16(%0), %%xmm5 \n\t"\
            "movdqa (%0), %%xmm7 \n\t"\
            "movdqa %%xmm4, %%xmm3 \n\t"\
            "movdqa %%xmm4, %%xmm2 \n\t"\
            "movdqa %%xmm4, %%xmm1 \n\t"\
            "movdqa %%xmm4, %%xmm0 \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr $8, %%xmm5, %%xmm1 \n\t"\
            "palignr $6, %%xmm5, %%xmm2 \n\t"\
            "palignr $4, %%xmm5, %%xmm3 \n\t"\
            "palignr $2, %%xmm5, %%xmm4 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "movdqa %%xmm5, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm3 \n\t"\
            "palignr $8, %%xmm7, %%xmm4 \n\t"\
            "palignr $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "palignr $6, %%xmm7, %%xmm5 \n\t"\
            "palignr $4, %%xmm7, %%xmm6 \n\t"\
            "paddw %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm5 \n\t"\
            \
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "psraw $6, %%xmm3 \n\t"\
            "packuswb %%xmm0, %%xmm3 \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            "jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((long)dstStride)\
            : "memory"\
        );\
    }else{\
        asm volatile(\
            "1: \n\t"\
            "movdqa 16(%0), %%xmm1 \n\t"\
            "movdqa (%0), %%xmm0 \n\t"\
            "movdqa %%xmm1, %%xmm2 \n\t"\
            "movdqa %%xmm1, %%xmm3 \n\t"\
            "movdqa %%xmm1, %%xmm4 \n\t"\
            "movdqa %%xmm1, %%xmm5 \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr $8, %%xmm0, %%xmm4 \n\t"\
            "palignr $6, %%xmm0, %%xmm3 \n\t"\
            "palignr $4, %%xmm0, %%xmm2 \n\t"\
            "palignr $2, %%xmm0, %%xmm1 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "packuswb %%xmm0, %%xmm0 \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            "jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((long)dstStride)\
            : "memory"\
        );\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst, src, dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst, src, dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
#define put_pixels8_ssse3 put_pixels8_mmx2
#define put_pixels16_ssse3 put_pixels16_sse2
#define avg_pixels8_ssse3 avg_pixels8_mmx2
#define avg_pixels16_ssse3 avg_pixels16_sse2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
#define put_h264_qpel8_h_lowpass_ssse3 put_h264_qpel8_h_lowpass_mmx2
#define put_h264_qpel16_h_lowpass_ssse3 put_h264_qpel16_h_lowpass_mmx2
#define avg_h264_qpel8_h_lowpass_ssse3 avg_h264_qpel8_h_lowpass_mmx2
#define avg_h264_qpel16_h_lowpass_ssse3 avg_h264_qpel16_h_lowpass_mmx2
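// the H264_MC wrappers gain an ALIGN argument so their on-stack temp buffers
// can be declared 16-byte aligned for the SSE2/SSSE3 paths (which use aligned
// movdqa accesses), while the MMX/3dnow instantiations keep 8-byte alignment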
#define H264_MC(OPNAME, SIZE, MMX) \
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\
@@ -1311,7 +1681,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*SIZE/8]);\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
@@ -1322,48 +1692,48 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*SIZE/8]);\
    uint8_t * const half= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*SIZE/8]);\
    uint8_t * const halfV= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*SIZE/8]);\
    uint8_t * const halfV= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*SIZE/8]);\
    uint8_t * const halfV= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*SIZE/8]);\
    uint8_t * const halfV= (uint8_t*)temp;\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*(SIZE<8?12:24)/4]);\
    int16_t * const tmp= (int16_t*)temp;\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8]);\
    uint8_t * const halfHV= (uint8_t*)temp;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
    assert(((int)temp & 7) == 0);\
@@ -1372,7 +1742,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8]);\
    uint8_t * const halfHV= (uint8_t*)temp;\
    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
    assert(((int)temp & 7) == 0);\
@@ -1381,7 +1751,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8]);\
    int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
    uint8_t * const halfHV= ((uint8_t*)temp);\
    assert(((int)temp & 7) == 0);\
@@ -1390,7 +1760,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
    DECLARE_ALIGNED(ALIGN, uint64_t, temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8]);\
    int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
    uint8_t * const halfHV= ((uint8_t*)temp);\
    assert(((int)temp & 7) == 0);\
@@ -1415,20 +1785,37 @@ QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
#ifdef HAVE_SSSE3
QPEL_H264_XMM(put_, PUT_OP, ssse3)
QPEL_H264_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB
H264_MC(put_, 4, 3dnow)
H264_MC(put_, 8, 3dnow)
H264_MC(put_, 16, 3dnow)
H264_MC(avg_, 4, 3dnow)
H264_MC(avg_, 8, 3dnow)
H264_MC(avg_, 16, 3dnow)
H264_MC(put_, 4, mmx2)
H264_MC(put_, 8, mmx2)
H264_MC(put_, 16, mmx2)
H264_MC(avg_, 4, mmx2)
H264_MC(avg_, 8, mmx2)
H264_MC(avg_, 16, mmx2)
H264_MC(put_, 4, 3dnow, 8)
H264_MC(put_, 8, 3dnow, 8)
H264_MC(put_, 16, 3dnow, 8)
H264_MC(avg_, 4, 3dnow, 8)
H264_MC(avg_, 8, 3dnow, 8)
H264_MC(avg_, 16, 3dnow, 8)
H264_MC(put_, 4, mmx2, 8)
H264_MC(put_, 8, mmx2, 8)
H264_MC(put_, 16, mmx2, 8)
H264_MC(avg_, 4, mmx2, 8)
H264_MC(avg_, 8, mmx2, 8)
H264_MC(avg_, 16, mmx2, 8)
#ifdef HAVE_SSSE3
// FIXME some of these only require sse2
H264_MC(put_, 8, ssse3, 16)
H264_MC(put_, 16, ssse3, 16)
H264_MC(avg_, 8, ssse3, 16)
H264_MC(avg_, 16, ssse3, 16)
#endif
// no mc20 because it wasn't significantly faster than mmx2
#define put_h264_qpel8_mc20_ssse3 put_h264_qpel8_mc20_mmx2
#define put_h264_qpel16_mc20_ssse3 put_h264_qpel16_mc20_mmx2
#define avg_h264_qpel8_mc20_ssse3 avg_h264_qpel8_mc20_mmx2
#define avg_h264_qpel16_mc20_ssse3 avg_h264_qpel16_mc20_mmx2
#define H264_CHROMA_OP(S,D)