@@ -30,8 +30,6 @@
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
@@ -85,7 +83,6 @@ int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
@@ -126,232 +123,12 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
#if HAVE_INLINE_ASM

static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2(((uintptr_t) pix & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
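
/*
 * For reference, a plain-C sketch of the metric the inline-asm loop above
 * accumulates: the sum of absolute differences between vertically adjacent
 * rows of a 16-pixel-wide block (a "vertical SAD" used as an intra noise
 * estimate).  The helper name and exact loop bounds are illustrative only
 * and are not part of this file; the asm keeps four 16-bit partial sums in
 * %%mm6 and folds them at the end, so edge/overflow behaviour may differ
 * slightly from this sketch.
 */
static av_unused int vsad_intra16_scalar_sketch(const uint8_t *pix,
                                                ptrdiff_t stride, int h)
{
    int score = 0;

    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 16; x++)
            score += FFABS(pix[x] - pix[x + stride]);
        pix += stride;
    }

    return score;
}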
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2(((uintptr_t) pix1 & 7) == 0);
    av_assert2(((uintptr_t) pix2 & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM
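
/*
 * Plain-C sketch of the inter variant above: the vertical SAD of the
 * residual pix1 - pix2.  The 0x80 bias built into %%mm7 by the asm
 * (pcmpeqw/psllw/packsswb) only exists so that signed byte differences can
 * be compared with unsigned saturating instructions; it cancels out of the
 * result.  Helper name and exact edge handling are illustrative only.
 */
static av_unused int vsad16_scalar_sketch(const uint8_t *pix1, const uint8_t *pix2,
                                          ptrdiff_t stride, int h)
{
    int score = 0;

    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 16; x++)
            score += FFABS((pix1[x] - pix2[x]) -
                           (pix1[x + stride] - pix2[x + stride]));
        pix1 += stride;
        pix2 += stride;
    }

    return score;
}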
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};
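
/*
 * round_tab holds the per-16-bit-lane rounding bias that the PIX_SAD
 * wrappers below preload into %%mm5: 0 for full-pel, 1 for the two-tap
 * x2/y2 half-pel average, 2 for the four-tap xy2 average.  As a scalar
 * sketch (names illustrative, not part of this file), the averages being
 * approximated are:
 */
static av_unused int halfpel_avg2_sketch(int a, int b)
{
    return (a + b + 1) >> 1;            /* rounding from round_tab[1] */
}

static av_unused int halfpel_avg4_sketch(int a, int b, int c, int d)
{
    return (a + b + c + d + 2) >> 2;    /* rounding from round_tab[2] */
}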
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                      \n\t"
        "1:                              \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0  \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2  \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4  \n\t"
        "add %3, %%"FF_REG_a"            \n\t"
        "psubusb %%mm0, %%mm2            \n\t"
        "psubusb %%mm4, %%mm0            \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm1  \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3  \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5  \n\t"
        "psubusb %%mm1, %%mm3            \n\t"
        "psubusb %%mm5, %%mm1            \n\t"
        "por %%mm2, %%mm0                \n\t"
        "por %%mm1, %%mm3                \n\t"
        "movq %%mm0, %%mm1               \n\t"
        "movq %%mm3, %%mm2               \n\t"
        "punpcklbw %%mm7, %%mm0          \n\t"
        "punpckhbw %%mm7, %%mm1          \n\t"
        "punpcklbw %%mm7, %%mm3          \n\t"
        "punpckhbw %%mm7, %%mm2          \n\t"
        "paddw %%mm1, %%mm0              \n\t"
        "paddw %%mm3, %%mm2              \n\t"
        "paddw %%mm2, %%mm0              \n\t"
        "paddw %%mm0, %%mm6              \n\t"
        "add %3, %%"FF_REG_a"            \n\t"
        "js 1b                           \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
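
/*
 * Scalar sketch of the accumulation above (helper name illustrative): an
 * 8-wide SAD over h rows.  The asm obtains |a - b| per byte as the OR of
 * the two saturating byte subtractions and keeps word-sized partial sums
 * in %%mm6, which sum_mmx() later folds into the final value.
 */
static av_unused int sad8_scalar_sketch(const uint8_t *blk1, const uint8_t *blk2,
                                        ptrdiff_t stride, int h)
{
    int sum = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += FFABS(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }

    return sum;
}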
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                      \n\t"
        "1:                              \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0  \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1  \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2  \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3  \n\t"
        "punpcklbw %%mm7, %%mm0          \n\t"
        "punpcklbw %%mm7, %%mm1          \n\t"
        "punpckhbw %%mm7, %%mm2          \n\t"
        "punpckhbw %%mm7, %%mm3          \n\t"
        "paddw %%mm0, %%mm1              \n\t"
        "paddw %%mm2, %%mm3              \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4  \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2  \n\t"
        "paddw %%mm5, %%mm1              \n\t"
        "paddw %%mm5, %%mm3              \n\t"
        "psrlw $1, %%mm1                 \n\t"
        "psrlw $1, %%mm3                 \n\t"
        "packuswb %%mm3, %%mm1           \n\t"
        "psubusb %%mm1, %%mm4            \n\t"
        "psubusb %%mm2, %%mm1            \n\t"
        "por %%mm4, %%mm1                \n\t"
        "movq %%mm1, %%mm0               \n\t"
        "punpcklbw %%mm7, %%mm0          \n\t"
        "punpckhbw %%mm7, %%mm1          \n\t"
        "paddw %%mm1, %%mm0              \n\t"
        "paddw %%mm0, %%mm6              \n\t"
        "add %4, %%"FF_REG_a"            \n\t"
        "js 1b                           \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}
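
/*
 * Scalar sketch of sad8_2_mmx (helper name illustrative): the SAD between
 * blk2 and the rounded two-tap average of blk1a and blk1b.  %%mm5 is
 * expected to hold round_tab[1] (the "+ 1" below) when this helper is
 * invoked from the PIX_SAD wrappers further down.
 */
static av_unused int sad8_2_scalar_sketch(const uint8_t *blk1a, const uint8_t *blk1b,
                                          const uint8_t *blk2,
                                          ptrdiff_t stride, int h)
{
    int sum = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int avg = (blk1a[x] + blk1b[x] + 1) >> 1;
            sum += FFABS(blk2[x] - avg);
        }
        blk1a += stride;
        blk1b += stride;
        blk2  += stride;
    }

    return sum;
}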
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
@@ -421,63 +198,7 @@ static inline int sum_mmx(void)
    return ret & 0xFFFF;
}
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
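
/*
 * The two wrappers above select the half-pel reference for sad8_2_mmx:
 * x2a averages horizontally adjacent pixels (blk1, blk1 + 1), y2a averages
 * vertically adjacent pixels (blk1, blk1 + stride), matching the x2/y2
 * pix_abs slots filled in ff_me_cmp_init_x86() below.
 */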
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
    \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
    \
    return sum_ ## suf(); \
} \
    \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5    \n\t" \
        :: "m" (round_tab[1])); \
    \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
    \
    return sum_ ## suf(); \
} \
    \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5    \n\t" \
        :: "m" (round_tab[1])); \
    \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
    \
    return sum_ ## suf(); \
}

#define PIX_SADXY(suf) \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
@@ -492,50 +213,6 @@ static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
    return sum_ ## suf(); \
} \
    \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
    \
    sad8_1_ ## suf(blk1,     blk2,     stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
    \
    return sum_ ## suf(); \
} \
    \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5    \n\t" \
        :: "m" (round_tab[1])); \
    \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
    \
    return sum_ ## suf(); \
} \
    \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5    \n\t" \
        :: "m" (round_tab[1])); \
    \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
    \
    return sum_ ## suf(); \
} \
    \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
@@ -550,7 +227,7 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
    return sum_ ## suf(); \
} \

PIX_SAD(mmx)
PIX_SADXY(mmx)
#endif /* HAVE_INLINE_ASM */

@@ -560,32 +237,13 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }
#endif /* HAVE_INLINE_ASM */
    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_X86ASM
        c->nsse[0]           = nsse16_mmx;
@@ -594,9 +252,10 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
#if !HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
#endif
        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;