@@ -63,123 +63,119 @@ void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTEL
 /***********************************/
 /* deblocking */
-static av_always_inline void h264_loop_filter_strength_iteration_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
-                                                                      int8_t ref[2][40], int16_t mv[2][40][2],
-                                                                      int bidir, int edges, int step,
-                                                                      int mask_mv, int dir, const int d_idx,
-                                                                      const uint64_t mask_dir)
-{
-    x86_reg b_idx;
-    mask_mv <<= 3;
-    for (b_idx = 0; b_idx < edges; b_idx += step) {
-        if (!mask_dir)
-            __asm__ volatile(
-                "pxor %%mm0, %%mm0 \n\t"
-                ::
-            );
-        if (!(mask_mv & b_idx)) {
-            if (bidir) {
-                __asm__ volatile(
-                    "movd         %a3(%0,%2), %%mm2 \n"
-                    "punpckldq    %a4(%0,%2), %%mm2 \n" // { ref0[bn], ref1[bn] }
-                    "pshufw $0x44, 12(%0,%2), %%mm0 \n" // { ref0[b], ref0[b] }
-                    "pshufw $0x44, 52(%0,%2), %%mm1 \n" // { ref1[b], ref1[b] }
-                    "pshufw $0x4E,     %%mm2, %%mm3 \n"
-                    "psubb             %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
-                    "psubb             %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
-
-                    "por               %%mm1, %%mm0 \n"
-                    "movq       %a5(%1,%2,4), %%mm1 \n"
-                    "movq       %a6(%1,%2,4), %%mm2 \n"
-                    "movq              %%mm1, %%mm3 \n"
-                    "movq              %%mm2, %%mm4 \n"
-                    "psubw       48(%1,%2,4), %%mm1 \n"
-                    "psubw       56(%1,%2,4), %%mm2 \n"
-                    "psubw      208(%1,%2,4), %%mm3 \n"
-                    "psubw      216(%1,%2,4), %%mm4 \n"
-                    "packsswb          %%mm2, %%mm1 \n"
-                    "packsswb          %%mm4, %%mm3 \n"
-                    "paddb             %%mm6, %%mm1 \n"
-                    "paddb             %%mm6, %%mm3 \n"
-                    "psubusb           %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
-                    "psubusb           %%mm5, %%mm3 \n"
-                    "packsswb          %%mm3, %%mm1 \n"
-
-                    "por               %%mm1, %%mm0 \n"
-                    "movq       %a7(%1,%2,4), %%mm1 \n"
-                    "movq       %a8(%1,%2,4), %%mm2 \n"
-                    "movq              %%mm1, %%mm3 \n"
-                    "movq              %%mm2, %%mm4 \n"
-                    "psubw       48(%1,%2,4), %%mm1 \n"
-                    "psubw       56(%1,%2,4), %%mm2 \n"
-                    "psubw      208(%1,%2,4), %%mm3 \n"
-                    "psubw      216(%1,%2,4), %%mm4 \n"
-                    "packsswb          %%mm2, %%mm1 \n"
-                    "packsswb          %%mm4, %%mm3 \n"
-                    "paddb             %%mm6, %%mm1 \n"
-                    "paddb             %%mm6, %%mm3 \n"
-                    "psubusb           %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
-                    "psubusb           %%mm5, %%mm3 \n"
-                    "packsswb          %%mm3, %%mm1 \n"
-
-                    "pshufw $0x4E,     %%mm1, %%mm1 \n"
-                    "por               %%mm1, %%mm0 \n"
-                    "pshufw $0x4E,     %%mm0, %%mm1 \n"
-                    "pminub            %%mm1, %%mm0 \n"
-                    :: "r"(ref),
-                       "r"(mv),
-                       "r"(b_idx),
-                       "i"(d_idx + 12),
-                       "i"(d_idx + 52),
-                       "i"(d_idx * 4 + 48),
-                       "i"(d_idx * 4 + 56),
-                       "i"(d_idx * 4 + 208),
-                       "i"(d_idx * 4 + 216)
-                );
-            } else {
-                __asm__ volatile(
-                    "movd       12(%0,%2), %%mm0 \n"
-                    "psubb    %a3(%0,%2), %%mm0 \n" // ref[b] != ref[bn]
-                    "movq    48(%1,%2,4), %%mm1 \n"
-                    "movq    56(%1,%2,4), %%mm2 \n"
-                    "psubw  %a4(%1,%2,4), %%mm1 \n"
-                    "psubw  %a5(%1,%2,4), %%mm2 \n"
-                    "packsswb      %%mm2, %%mm1 \n"
-                    "paddb         %%mm6, %%mm1 \n"
-                    "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
-                    "packsswb      %%mm1, %%mm1 \n"
-                    "por           %%mm1, %%mm0 \n"
-                    :: "r"(ref),
-                       "r"(mv),
-                       "r"(b_idx),
-                       "i"(d_idx + 12),
-                       "i"(d_idx * 4 + 48),
-                       "i"(d_idx * 4 + 56)
-                );
-            }
-        }
-        __asm__ volatile(
-            "movd   12(%0,%1), %%mm1 \n"
-            "por  %a2(%0,%1), %%mm1 \n" // nnz[b] || nnz[bn]
-            :: "r"(nnz),
-               "r"(b_idx),
-               "i"(d_idx + 12)
-        );
-        __asm__ volatile(
-            "pminub    %%mm7, %%mm1 \n"
-            "pminub    %%mm7, %%mm0 \n"
-            "psllw        $1, %%mm1 \n"
-            "pxor      %%mm2, %%mm2 \n"
-            "pmaxub    %%mm0, %%mm1 \n"
-            "punpcklbw %%mm2, %%mm1 \n"
-            "movq      %%mm1, %a1(%0,%2) \n"
-            :: "r"(bS),
-               "i"(32 * dir),
-               "r"(b_idx)
-            : "memory"
-        );
-    }
-}
+#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \
+    do { \
+        x86_reg b_idx; \
+        mask_mv <<= 3; \
+        for (b_idx = 0; b_idx < edges; b_idx += step) { \
+            if (!mask_dir) \
+                __asm__ volatile( \
+                    "pxor %%mm0, %%mm0 \n\t" \
+                    :: \
+                ); \
+            if (!(mask_mv & b_idx)) { \
+                if (bidir) { \
+                    __asm__ volatile( \
+                        "movd         %a3(%0,%2), %%mm2 \n" \
+                        "punpckldq    %a4(%0,%2), %%mm2 \n" /* { ref0[bn], ref1[bn] } */ \
+                        "pshufw $0x44, 12(%0,%2), %%mm0 \n" /* { ref0[b], ref0[b] } */ \
+                        "pshufw $0x44, 52(%0,%2), %%mm1 \n" /* { ref1[b], ref1[b] } */ \
+                        "pshufw $0x4E,     %%mm2, %%mm3 \n" \
+                        "psubb             %%mm2, %%mm0 \n" /* { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } */ \
+                        "psubb             %%mm3, %%mm1 \n" /* { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } */ \
+                        \
+                        "por               %%mm1, %%mm0 \n" \
+                        "movq       %a5(%1,%2,4), %%mm1 \n" \
+                        "movq       %a6(%1,%2,4), %%mm2 \n" \
+                        "movq              %%mm1, %%mm3 \n" \
+                        "movq              %%mm2, %%mm4 \n" \
+                        "psubw       48(%1,%2,4), %%mm1 \n" \
+                        "psubw       56(%1,%2,4), %%mm2 \n" \
+                        "psubw      208(%1,%2,4), %%mm3 \n" \
+                        "psubw      216(%1,%2,4), %%mm4 \n" \
+                        "packsswb          %%mm2, %%mm1 \n" \
+                        "packsswb          %%mm4, %%mm3 \n" \
+                        "paddb             %%mm6, %%mm1 \n" \
+                        "paddb             %%mm6, %%mm3 \n" \
+                        "psubusb           %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
+                        "psubusb           %%mm5, %%mm3 \n" \
+                        "packsswb          %%mm3, %%mm1 \n" \
+                        \
+                        "por               %%mm1, %%mm0 \n" \
+                        "movq       %a7(%1,%2,4), %%mm1 \n" \
+                        "movq       %a8(%1,%2,4), %%mm2 \n" \
+                        "movq              %%mm1, %%mm3 \n" \
+                        "movq              %%mm2, %%mm4 \n" \
+                        "psubw       48(%1,%2,4), %%mm1 \n" \
+                        "psubw       56(%1,%2,4), %%mm2 \n" \
+                        "psubw      208(%1,%2,4), %%mm3 \n" \
+                        "psubw      216(%1,%2,4), %%mm4 \n" \
+                        "packsswb          %%mm2, %%mm1 \n" \
+                        "packsswb          %%mm4, %%mm3 \n" \
+                        "paddb             %%mm6, %%mm1 \n" \
+                        "paddb             %%mm6, %%mm3 \n" \
+                        "psubusb           %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
+                        "psubusb           %%mm5, %%mm3 \n" \
+                        "packsswb          %%mm3, %%mm1 \n" \
+                        \
+                        "pshufw $0x4E,     %%mm1, %%mm1 \n" \
+                        "por               %%mm1, %%mm0 \n" \
+                        "pshufw $0x4E,     %%mm0, %%mm1 \n" \
+                        "pminub            %%mm1, %%mm0 \n" \
+                        :: "r"(ref), \
+                           "r"(mv), \
+                           "r"(b_idx), \
+                           "i"(d_idx + 12), \
+                           "i"(d_idx + 52), \
+                           "i"(d_idx * 4 + 48), \
+                           "i"(d_idx * 4 + 56), \
+                           "i"(d_idx * 4 + 208), \
+                           "i"(d_idx * 4 + 216) \
+                    ); \
+                } else { \
+                    __asm__ volatile( \
+                        "movd       12(%0,%2), %%mm0 \n" \
+                        "psubb    %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \
+                        "movq    48(%1,%2,4), %%mm1 \n" \
+                        "movq    56(%1,%2,4), %%mm2 \n" \
+                        "psubw  %a4(%1,%2,4), %%mm1 \n" \
+                        "psubw  %a5(%1,%2,4), %%mm2 \n" \
+                        "packsswb      %%mm2, %%mm1 \n" \
+                        "paddb         %%mm6, %%mm1 \n" \
+                        "psubusb       %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
+                        "packsswb      %%mm1, %%mm1 \n" \
+                        "por           %%mm1, %%mm0 \n" \
+                        :: "r"(ref), \
+                           "r"(mv), \
+                           "r"(b_idx), \
+                           "i"(d_idx + 12), \
+                           "i"(d_idx * 4 + 48), \
+                           "i"(d_idx * 4 + 56) \
+                    ); \
+                } \
+            } \
+            __asm__ volatile( \
+                "movd   12(%0,%1), %%mm1 \n" \
+                "por  %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \
+                :: "r"(nnz), \
+                   "r"(b_idx), \
+                   "i"(d_idx + 12) \
+            ); \
+            __asm__ volatile( \
+                "pminub    %%mm7, %%mm1 \n" \
+                "pminub    %%mm7, %%mm0 \n" \
+                "psllw        $1, %%mm1 \n" \
+                "pxor      %%mm2, %%mm2 \n" \
+                "pmaxub    %%mm0, %%mm1 \n" \
+                "punpcklbw %%mm2, %%mm1 \n" \
+                "movq      %%mm1, %a1(%0,%2) \n" \
+                :: "r"(bS), \
+                   "i"(32 * dir), \
+                   "r"(b_idx) \
+                : "memory" \
+            ); \
+        } \
+    } while (0)
 static void h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field) {
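Why the iteration body works as a macro but is fragile as a function: the asm operands "i"(d_idx + 12), "i"(d_idx * 4 + 48), etc. use GCC's "i" constraint, which only accepts integer immediates known at compile time, and the %a3-style operand modifier then prints that immediate as a bare displacement inside the (%0,%2) addressing expression. The av_always_inline form depends on the compiler propagating a constant d_idx into the inlined body; when that propagation does not happen (for example at -O0), the "i" constraints cannot be satisfied, whereas macro expansion keeps the offsets textual compile-time constants. Below is a minimal self-contained sketch of the same pattern, assuming GCC or Clang on x86; LOAD32_AT and the test buffer are illustrative names, not part of this patch.

#include <stdint.h>
#include <stdio.h>

/* OFF must fold to a compile-time constant: the "i" constraint demands an
 * immediate, and the "%a1" modifier prints it without the '$' prefix so it
 * can serve as the displacement in a base+displacement address. */
#define LOAD32_AT(base, OFF)                        \
    ({                                              \
        uint32_t v_;                                \
        __asm__("movl %a1(%2), %0"                  \
                : "=r"(v_)                          \
                : "i"(OFF), "r"(base)               \
                : "memory");                        \
        v_;                                         \
    })

int main(void)
{
    uint32_t buf[4] = { 1, 2, 3, 4 };
    /* 8 is a literal, so "i" is satisfied; a runtime variable here would
     * trigger "error: impossible constraint in 'asm'" unless the compiler
     * can still prove it constant -- the failure mode the macro form of
     * h264_loop_filter_strength_iteration_mmx2 avoids. */
    printf("%u\n", (unsigned)LOAD32_AT((const uint8_t *)buf, 8)); /* prints 3 */
    return 0;
}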