@@ -63,123 +63,119 @@ void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTEL
/***********************************/
/* deblocking */
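/* Per-direction iteration of the loop filter strength (bS) computation.
 * For each pair of 4x4 blocks b/bn across an edge it computes, roughly
 * (scalar sketch; the vertical limit differs in field mode, and the
 * constant registers %%mm5-%%mm7 are assumed to be preloaded by the
 * caller, which lies outside this hunk):
 *
 *     bS = (nnz[b] || nnz[bn]) ? 2
 *        : (ref[b] != ref[bn] || |mv[b] - mv[bn]| >= 4) ? 1 : 0;
 *
 * with mv differences measured per component in quarter-pel units. */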
static av_always_inline void h264_loop_filter_strength_iteration_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
                                                                      int8_t ref[2][40], int16_t mv[2][40][2],
                                                                      int bidir, int edges, int step,
                                                                      int mask_mv, int dir, const int d_idx,
                                                                      const uint64_t mask_dir)
{
    x86_reg b_idx;
    mask_mv <<= 3;
    for (b_idx = 0; b_idx < edges; b_idx += step) {
        if (!mask_dir)
            __asm__ volatile(
                "pxor %%mm0, %%mm0 \n\t"
                ::
            );
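        /* Mismatch test for refs and mvs. The mv deltas are packed to
         * saturated signed bytes; adding the bias in %%mm6 and doing an
         * unsigned saturating subtract of %%mm5 leaves a nonzero byte
         * exactly when |delta| >= limit (assuming the caller set
         * %%mm6 = limit-1 and %%mm5 = 2*(limit-1) in each byte lane). In
         * the bidir case both list pairings are compared (the pshufw $0x4E
         * swaps build the crossed pairing) and pminub keeps the pairing
         * that matches better. */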
        if (!(mask_mv & b_idx)) {
            if (bidir) {
                __asm__ volatile(
                    "movd      %a3(%0,%2), %%mm2 \n"
                    "punpckldq %a4(%0,%2), %%mm2 \n" // { ref0[bn], ref1[bn] }
                    "pshufw $0x44, 12(%0,%2), %%mm0 \n" // { ref0[b], ref0[b] }
                    "pshufw $0x44, 52(%0,%2), %%mm1 \n" // { ref1[b], ref1[b] }
                    "pshufw $0x4E, %%mm2, %%mm3 \n"
                    "psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
                    "psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
                    "por %%mm1, %%mm0 \n"
                    "movq %a5(%1,%2,4), %%mm1 \n"
                    "movq %a6(%1,%2,4), %%mm2 \n"
                    "movq %%mm1, %%mm3 \n"
                    "movq %%mm2, %%mm4 \n"
                    "psubw  48(%1,%2,4), %%mm1 \n"
                    "psubw  56(%1,%2,4), %%mm2 \n"
                    "psubw 208(%1,%2,4), %%mm3 \n"
                    "psubw 216(%1,%2,4), %%mm4 \n"
                    "packsswb %%mm2, %%mm1 \n"
                    "packsswb %%mm4, %%mm3 \n"
                    "paddb %%mm6, %%mm1 \n"
                    "paddb %%mm6, %%mm3 \n"
                    "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
                    "psubusb %%mm5, %%mm3 \n"
                    "packsswb %%mm3, %%mm1 \n"
                    "por %%mm1, %%mm0 \n"
                    "movq %a7(%1,%2,4), %%mm1 \n"
                    "movq %a8(%1,%2,4), %%mm2 \n"
                    "movq %%mm1, %%mm3 \n"
                    "movq %%mm2, %%mm4 \n"
                    "psubw  48(%1,%2,4), %%mm1 \n"
                    "psubw  56(%1,%2,4), %%mm2 \n"
                    "psubw 208(%1,%2,4), %%mm3 \n"
                    "psubw 216(%1,%2,4), %%mm4 \n"
                    "packsswb %%mm2, %%mm1 \n"
                    "packsswb %%mm4, %%mm3 \n"
                    "paddb %%mm6, %%mm1 \n"
                    "paddb %%mm6, %%mm3 \n"
                    "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
                    "psubusb %%mm5, %%mm3 \n"
                    "packsswb %%mm3, %%mm1 \n"
                    "pshufw $0x4E, %%mm1, %%mm1 \n"
                    "por %%mm1, %%mm0 \n"
                    "pshufw $0x4E, %%mm0, %%mm1 \n"
                    "pminub %%mm1, %%mm0 \n"
                    ::"r"(ref),
                      "r"(mv),
                      "r"(b_idx),
                      "i"(d_idx + 12),
                      "i"(d_idx + 52),
                      "i"(d_idx * 4 + 48),
                      "i"(d_idx * 4 + 56),
                      "i"(d_idx * 4 + 208),
                      "i"(d_idx * 4 + 216)
                );
            } else {
                __asm__ volatile(
                    "movd 12(%0,%2), %%mm0 \n"
                    "psubb %a3(%0,%2), %%mm0 \n" // ref[b] != ref[bn]
                    "movq 48(%1,%2,4), %%mm1 \n"
                    "movq 56(%1,%2,4), %%mm2 \n"
                    "psubw %a4(%1,%2,4), %%mm1 \n"
                    "psubw %a5(%1,%2,4), %%mm2 \n"
                    "packsswb %%mm2, %%mm1 \n"
                    "paddb %%mm6, %%mm1 \n"
                    "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
                    "packsswb %%mm1, %%mm1 \n"
                    "por %%mm1, %%mm0 \n"
                    ::"r"(ref),
                      "r"(mv),
                      "r"(b_idx),
                      "i"(d_idx + 12),
                      "i"(d_idx * 4 + 48),
                      "i"(d_idx * 4 + 56)
                );
            }
        }
        __asm__ volatile(
            "movd 12(%0,%1), %%mm1 \n"
            "por %a2(%0,%1), %%mm1 \n" // nnz[b] || nnz[bn]
            ::"r"(nnz),
              "r"(b_idx),
              "i"(d_idx + 12)
        );
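        /* Combine the two conditions: clamp both flags to 1 (assuming
         * %%mm7 holds bytes of 0x01, per the caller's setup), double the
         * nnz flag so it yields bS = 2, take the per-edge maximum, then
         * widen the bytes to int16 and store one row of bS[dir]. */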
        __asm__ volatile(
            "pminub %%mm7, %%mm1 \n"
            "pminub %%mm7, %%mm0 \n"
            "psllw $1, %%mm1 \n"
            "pxor %%mm2, %%mm2 \n"
            "pmaxub %%mm0, %%mm1 \n"
            "punpcklbw %%mm2, %%mm1 \n"
            "movq %%mm1, %a1(%0,%2) \n"
            ::"r"(bS),
              "i"(32 * dir),
              "r"(b_idx)
            : "memory"
        );
    }
}
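/* The same iteration expressed as a macro. In both variants the "i"
 * (immediate) asm operand constraints require d_idx and dir to collapse to
 * compile-time constants at each use: the function form relies on
 * av_always_inline for that, the macro form on textual expansion. */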
#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \
    do { \
        x86_reg b_idx; \
        mask_mv <<= 3; \
        for (b_idx = 0; b_idx < edges; b_idx += step) { \
            if (!mask_dir) \
                __asm__ volatile( \
                    "pxor %%mm0, %%mm0 \n\t" \
                    :: \
                ); \
            if (!(mask_mv & b_idx)) { \
                if (bidir) { \
                    __asm__ volatile( \
" movd %a3(%0,%2), %%mm2 \n " \
" punpckldq %a4(%0,%2), %%mm2 \n " /* { ref0[bn], ref1[bn] } */ \
" pshufw $0x44, 12(%0,%2), %%mm0 \n " /* { ref0[b], ref0[b] } */ \
" pshufw $0x44, 52(%0,%2), %%mm1 \n " /* { ref1[b], ref1[b] } */ \
" pshufw $0x4E, %%mm2, %%mm3 \n " \
" psubb %%mm2, %%mm0 \n " /* { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } */ \
" psubb %%mm3, %%mm1 \n " /* { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } */ \
\
" por %%mm1, %%mm0 \n " \
" movq %a5(%1,%2,4), %%mm1 \n " \
" movq %a6(%1,%2,4), %%mm2 \n " \
" movq %%mm1, %%mm3 \n " \
" movq %%mm2, %%mm4 \n " \
" psubw 48(%1,%2,4), %%mm1 \n " \
" psubw 56(%1,%2,4), %%mm2 \n " \
" psubw 208(%1,%2,4), %%mm3 \n " \
" psubw 216(%1,%2,4), %%mm4 \n " \
" packsswb %%mm2, %%mm1 \n " \
" packsswb %%mm4, %%mm3 \n " \
" paddb %%mm6, %%mm1 \n " \
" paddb %%mm6, %%mm3 \n " \
" psubusb %%mm5, %%mm1 \n " /* abs(mv[b] - mv[bn]) >= limit */ \
" psubusb %%mm5, %%mm3 \n " \
" packsswb %%mm3, %%mm1 \n " \
\
" por %%mm1, %%mm0 \n " \
" movq %a7(%1,%2,4), %%mm1 \n " \
" movq %a8(%1,%2,4), %%mm2 \n " \
" movq %%mm1, %%mm3 \n " \
" movq %%mm2, %%mm4 \n " \
" psubw 48(%1,%2,4), %%mm1 \n " \
" psubw 56(%1,%2,4), %%mm2 \n " \
" psubw 208(%1,%2,4), %%mm3 \n " \
" psubw 216(%1,%2,4), %%mm4 \n " \
" packsswb %%mm2, %%mm1 \n " \
" packsswb %%mm4, %%mm3 \n " \
" paddb %%mm6, %%mm1 \n " \
" paddb %%mm6, %%mm3 \n " \
" psubusb %%mm5, %%mm1 \n " /* abs(mv[b] - mv[bn]) >= limit */ \
" psubusb %%mm5, %%mm3 \n " \
" packsswb %%mm3, %%mm1 \n " \
\
" pshufw $0x4E, %%mm1, %%mm1 \n " \
" por %%mm1, %%mm0 \n " \
" pshufw $0x4E, %%mm0, %%mm1 \n " \
" pminub %%mm1, %%mm0 \n " \
: : " r " ( ref ) , \
" r " ( mv ) , \
" r " ( b_idx ) , \
" i " ( d_idx + 12 ) , \
" i " ( d_idx + 52 ) , \
" i " ( d_idx * 4 + 48 ) , \
" i " ( d_idx * 4 + 56 ) , \
" i " ( d_idx * 4 + 208 ) , \
" i " ( d_idx * 4 + 216 ) \
) ; \
                } else { \
                    __asm__ volatile( \
                        "movd 12(%0,%2), %%mm0 \n" \
                        "psubb %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \
                        "movq 48(%1,%2,4), %%mm1 \n" \
                        "movq 56(%1,%2,4), %%mm2 \n" \
                        "psubw %a4(%1,%2,4), %%mm1 \n" \
                        "psubw %a5(%1,%2,4), %%mm2 \n" \
                        "packsswb %%mm2, %%mm1 \n" \
                        "paddb %%mm6, %%mm1 \n" \
                        "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "packsswb %%mm1, %%mm1 \n" \
                        "por %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx + 12), \
                          "i"(d_idx * 4 + 48), \
                          "i"(d_idx * 4 + 56) \
                    ); \
                } \
            } \
            __asm__ volatile( \
                "movd 12(%0,%1), %%mm1 \n" \
                "por %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \
                ::"r"(nz), \
                  "r"(b_idx), \
                  "i"(d_idx + 12) \
            ); \
            __asm__ volatile( \
                "pminub %%mm7, %%mm1 \n" \
                "pminub %%mm7, %%mm0 \n" \
                "psllw $1, %%mm1 \n" \
                "pxor %%mm2, %%mm2 \n" \
                "pmaxub %%mm0, %%mm1 \n" \
                "punpcklbw %%mm2, %%mm1 \n" \
                "movq %%mm1, %a1(%0,%2) \n" \
                ::"r"(bS), \
                  "i"(32 * dir), \
                  "r"(b_idx) \
                : "memory" \
            ); \
        } \
    } while (0)
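/* Driver: presumably loads the byte constants into %%mm5-%%mm7 and runs the
 * iteration once per filtering direction; its body continues past this
 * excerpt. */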
static void h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                           int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field)
{