@@ -879,55 +879,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
    *left = src2[w-1];
}
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"          \n\t"\
    "mov"#m" "#p2", "#t"          \n\t"\
    "punpcklbw "#a", "#t"         \n\t"\
    "punpcklbw "#a", "#a"         \n\t"\
    "psubw     "#t", "#a"         \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
@@ -946,34 +897,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
@@ -1002,133 +925,16 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
    "paddusw "#t", "#a"               \n\t"\
    "movd    "#a", "#dst"             \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)           \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7           \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)           \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7           \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5            \n\t" /*FIXME remove*/ \
        "movq %%mm6, %%mm7            \n\t"\
        "movq %%mm0, %%mm6            \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)           \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2           \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0         \n\t"\
        "movq %%mm0, 64(%1)           \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0        \n\t"\
        "paddusw %%mm1, %%mm0         \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
@@ -1312,8 +1118,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
        c->diff_bytes = diff_bytes_mmx;
        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
        c->hadamard8_diff[0] = hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = hadamard8_diff_mmx;
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
@@ -1336,8 +1142,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->sum_abs_dctelem = sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0] = hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = hadamard8_diff_mmx2;
            c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2;
            c->vsad[4] = vsad_intra16_mmx2;
            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
@@ -1350,8 +1156,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
        if (mm_flags & AV_CPU_FLAG_SSE2) {
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem = sum_abs_dctelem_sse2;
            c->hadamard8_diff[0] = hadamard8_diff16_sse2;
            c->hadamard8_diff[1] = hadamard8_diff_sse2;
            c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
        }
        if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
@@ -1365,8 +1171,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
            }
            c->add_8x8basis = add_8x8basis_ssse3;
            c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0] = hadamard8_diff16_ssse3;
            c->hadamard8_diff[1] = hadamard8_diff_ssse3;
            c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
        }
#endif