@ -653,3 +653,294 @@ void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTE
" m " ( b0 ) , " m " ( b1 ) , " m " ( b2 ) , " m " ( b3 ) , " m " ( b4 ) , " m " ( b5 ) :
" % " REG_a " " , " % " REG_b " " , " % " REG_c " " ) ;
}
/*
 * Prologue shared by the SSE2 inner_add_yblock kernels.
 *
 * Declares dst_array (pointer to the sb->line entry for row src_y) and opens
 * an asm statement that is left unterminated on purpose: the kernel body and
 * one of the snow_inner_add_yblock_sse2_end_* macros must follow to close it.
 * Operand numbers refer to the operand lists in
 * snow_inner_add_yblock_sse2_end_common2.
 *
 * State established before/at label 1:
 *   REG_c = src_stride, REG_b = b_h (loop counter), REG_S = obmc,
 *   xmm7  = 0, xmm3 = 0x80 in every dword,
 *   REG_D = dst_array[0] + (src_x << 2), recomputed on every iteration.
 */
# define snow_inner_add_yblock_sse2_header \
DWTELEM * * dst_array = sb - > line + src_y ; \
asm volatile ( \
" mov %6, %% " REG_c " \n \t " /* REG_c = src_stride */ \
" mov %5, %% " REG_b " \n \t " /* REG_b = b_h */ \
" mov %3, %% " REG_S " \n \t " /* REG_S = obmc */ \
" pxor %%xmm7, %%xmm7 \n \t " /* 0 */ \
" pcmpeqd %%xmm3, %%xmm3 \n \t " /* all bits set */ \
" pslld $31, %%xmm3 \n \t " \
" psrld $24, %%xmm3 \n \t " /* 0x80 per dword: rounding term added before the kernels' psrad $8 */ \
" 1: \n \t " /* loop head, target of the closing jnz */ \
" mov %1, %% " REG_D " \n \t " \
" mov (%% " REG_D " ), %% " REG_D " \n \t " \
" add %2, %% " REG_D " \n \t " /* REG_D = dst_array[0] + (src_x << 2) bytes */
/*
 * Weighted load for the 8-wide, two-rows-per-iteration SSE2 kernel.
 *
 *   out_reg1   - xmm register name receiving row 0: 8 products as 16-bit words
 *   out_reg2   - xmm register name receiving row 1 (the row REG_c bytes later)
 *   ptr_offset - index (string) into the pointer array addressed by REG_a
 *   s_offset   - byte offset (string) from REG_S of the row-0 weights; the
 *                row-1 weights are read 16 bytes further on
 *
 * Bytes are zero-extended to words against xmm7 (zero) and multiplied with
 * pmullw, so only the low 16 bits of each product are kept.
 * Overwrites REG_d, xmm0 and xmm4.
 */
# define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
" mov " PTR_SIZE " * " ptr_offset " (%% " REG_a " ), %% " REG_d " ; \n \t " \
" movq (%% " REG_d " ), %% " out_reg1 " \n \t " \
" movq (%% " REG_d " , %% " REG_c " ), %% " out_reg2 " \n \t " \
" punpcklbw %%xmm7, %% " out_reg1 " \n \t " \
" punpcklbw %%xmm7, %% " out_reg2 " \n \t " \
" movq " s_offset " (%% " REG_S " ), %%xmm0 \n \t " \
" movq " s_offset " +16(%% " REG_S " ), %%xmm4 \n \t " \
" punpcklbw %%xmm7, %%xmm0 \n \t " \
" punpcklbw %%xmm7, %%xmm4 \n \t " \
" pmullw %%xmm0, %% " out_reg1 " \n \t " \
" pmullw %%xmm4, %% " out_reg2 " \n \t "
/*
 * Weighted load for the 16-wide, one-row-per-iteration SSE2 kernel.
 *
 *   out_reg1   - xmm register name receiving bytes 0..7 of the row as products
 *   out_reg2   - xmm register name receiving bytes 8..15 of the row as products
 *   ptr_offset - index (string) into the pointer array addressed by REG_a
 *   s_offset   - byte offset (string) from REG_S of the first 8 weights; the
 *                next 8 weights are read 8 bytes further on
 *
 * Bytes are zero-extended to words against xmm7 (zero) and multiplied with
 * pmullw, so only the low 16 bits of each product are kept.
 * Overwrites REG_d, xmm0 and xmm4.
 */
# define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
" mov " PTR_SIZE " * " ptr_offset " (%% " REG_a " ), %% " REG_d " ; \n \t " \
" movq (%% " REG_d " ), %% " out_reg1 " \n \t " \
" movq 8(%% " REG_d " ), %% " out_reg2 " \n \t " \
" punpcklbw %%xmm7, %% " out_reg1 " \n \t " \
" punpcklbw %%xmm7, %% " out_reg2 " \n \t " \
" movq " s_offset " (%% " REG_S " ), %%xmm0 \n \t " \
" movq " s_offset " +8(%% " REG_S " ), %%xmm4 \n \t " \
" punpcklbw %%xmm7, %%xmm0 \n \t " \
" punpcklbw %%xmm7, %%xmm4 \n \t " \
" pmullw %%xmm0, %% " out_reg1 " \n \t " \
" pmullw %%xmm4, %% " out_reg2 " \n \t "
/*
 * Computes one more weighted source with snow_inner_add_yblock_sse2_start_8
 * into the scratch registers xmm2/xmm6 and adds it, with unsigned word
 * saturation (paddusw), to the running sums held in xmm1/xmm5.
 */
# define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_8 ( " xmm2 " , " xmm6 " , ptr_offset , s_offset ) \
" paddusw %%xmm2, %%xmm1 \n \t " \
" paddusw %%xmm6, %%xmm5 \n \t "
/*
 * Computes one more weighted source with snow_inner_add_yblock_sse2_start_16
 * into the scratch registers xmm2/xmm6 and adds it, with unsigned word
 * saturation (paddusw), to the running sums held in xmm1/xmm5.
 */
# define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_16 ( " xmm2 " , " xmm6 " , ptr_offset , s_offset ) \
" paddusw %%xmm2, %%xmm1 \n \t " \
" paddusw %%xmm6, %%xmm5 \n \t "
/*
 * Pointer advance shared by both SSE2 loop tails.
 *
 * Moves REG_S (the weight pointer) forward by 32 bytes and adds REG_c to
 * operand 0 (dst8) and to each of the four pointers stored at REG_a.
 * The four pointer slots are updated in memory, so the array passed in as
 * `block` is left modified once the kernel returns.
 * Every add here rewrites the flags, which is why the loop counter is only
 * updated after this macro.
 */
# define snow_inner_add_yblock_sse2_end_common1\
" add $32, %% " REG_S " \n \t " \
" add %% " REG_c " , %0 \n \t " \
" add %% " REG_c " , " PTR_SIZE " *3(%% " REG_a " ); \n \t " \
" add %% " REG_c " , " PTR_SIZE " *2(%% " REG_a " ); \n \t " \
" add %% " REG_c " , " PTR_SIZE " *1(%% " REG_a " ); \n \t " \
" add %% " REG_c " , (%% " REG_a " ) \n \t "
/*
 * Closes the asm statement opened by snow_inner_add_yblock_sse2_header.
 *
 * Emits the loop branch (taken while the preceding counter update left a
 * non-zero result) followed by the operand lists:
 *   %0 dst8 (in/out, memory)      %1 dst_array (in/out, memory)
 *   %2 (long)(src_x << 2)         %3 obmc
 *   %4 block, pinned to REG_a     %5 (long)b_h
 *   %6 (long)src_stride
 *
 * The loop stores pixels through dst8 and rewrites the pointers in block[],
 * and it loads from obmc, the block rows and the destination lines; none of
 * that memory is an asm operand.  The "memory" clobber tells the compiler
 * that such memory is read and written, so values cached in registers are
 * not reused across the statement (relevant once these static kernels are
 * inlined into their callers).
 */
#define snow_inner_add_yblock_sse2_end_common2\
    "jnz 1b \n\t"\
    : "+m"(dst8), "+m"(dst_array)\
    :\
    "rm"((long)(src_x << 2)), "m"(obmc), "a"(block), "m"((long)b_h), "rm"((long)src_stride) :\
    "%" REG_b "", "%" REG_c "", "%" REG_S "", "%" REG_D "", "%" REG_d "", "memory");
/*
 * Loop tail of the 8-wide SSE2 kernel, which handles two rows per iteration.
 *
 * REG_c is doubled around snow_inner_add_yblock_sse2_end_common1 so that
 * dst8 and the four block pointers advance by two rows, then halved again.
 * Operand 1 (dst_array) advances by two pointer slots.  The row counter in
 * REG_b drops by 2; that sub sets the flags tested by the jnz in
 * snow_inner_add_yblock_sse2_end_common2, so the loop only terminates when
 * the counter reaches exactly zero.
 */
# define snow_inner_add_yblock_sse2_end_8\
" sal $1, %% " REG_c " \n \t " \
" add $ " PTR_SIZE " *2, %1 \n \t " \
snow_inner_add_yblock_sse2_end_common1 \
" sar $1, %% " REG_c " \n \t " \
" sub $2, %% " REG_b " \n \t " \
snow_inner_add_yblock_sse2_end_common2
/*
 * Loop tail of the 16-wide SSE2 kernel, which handles one row per iteration.
 *
 * Operand 1 (dst_array) advances by one pointer slot, the shared pointer
 * advance runs, and REG_b is decremented; that dec sets the flags tested by
 * the jnz in snow_inner_add_yblock_sse2_end_common2.
 */
# define snow_inner_add_yblock_sse2_end_16\
" add $ " PTR_SIZE " *1, %1 \n \t " \
snow_inner_add_yblock_sse2_end_common1 \
" dec %% " REG_b " \n \t " \
snow_inner_add_yblock_sse2_end_common2
/*
 * SSE2 kernel for 8-pixel-wide blocks; handles two rows per loop iteration.
 *
 * Per row: the four byte sources block[3..0] are multiplied by the weights at
 * obmc offsets 0, 8, 128 and 136 and summed; the sum is added to eight 32-bit
 * values of the destination line, rounded with +0x80, shifted right by 8,
 * saturated to 0..255 and stored as 8 bytes at dst8.
 *
 * The weight offsets and the 32-byte step per two rows are hard-coded
 * (matching the obmc_16 in the name); obmc_stride, b_w and add are never
 * read.  The loop counter drops by two per iteration and the loop exits only
 * at zero, so b_h has to be even and non-zero.  The destination lines are
 * accessed with movdqa and therefore have to be 16-byte aligned at
 * src_x << 2.  The pointers stored in block[] are advanced in place.
 */
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2 ( uint8_t * obmc , const long obmc_stride , uint8_t * * block , int b_w , long b_h ,
int src_x , int src_y , long src_stride , slice_buffer * sb , int add , uint8_t * dst8 ) {
snow_inner_add_yblock_sse2_header
/* xmm1 = weighted sum for row 0, xmm5 = weighted sum for row 1 (8 words each) */
snow_inner_add_yblock_sse2_start_8 ( " xmm1 " , " xmm5 " , " 3 " , " 0 " )
snow_inner_add_yblock_sse2_accum_8 ( " 2 " , " 8 " )
snow_inner_add_yblock_sse2_accum_8 ( " 1 " , " 128 " )
snow_inner_add_yblock_sse2_accum_8 ( " 0 " , " 136 " )
/* REG_d = dst8 */
" mov %0, %% " REG_d " \n \t "
/* row 0: widen the sums to dwords and add the destination line (REG_D) */
" movdqa (%% " REG_D " ), %%xmm0 \n \t "
" movdqa %%xmm1, %%xmm2 \n \t "
" punpckhwd %%xmm7, %%xmm1 \n \t "
" punpcklwd %%xmm7, %%xmm2 \n \t "
" paddd %%xmm2, %%xmm0 \n \t "
" movdqa 16(%% " REG_D " ), %%xmm2 \n \t "
" paddd %%xmm1, %%xmm2 \n \t "
/* add the 0x80 rounding term held in xmm3 */
" paddd %%xmm3, %%xmm0 \n \t "
" paddd %%xmm3, %%xmm2 \n \t "
/* REG_D = dst_array[1] + (src_x << 2): destination line of row 1 */
" mov %1, %% " REG_D " \n \t "
" mov " PTR_SIZE " (%% " REG_D " ), %% " REG_D " ; \n \t "
" add %2, %% " REG_D " \n \t "
/* row 1: same widening, line add and rounding, using xmm5 -> xmm4/xmm6 */
" movdqa (%% " REG_D " ), %%xmm4 \n \t "
" movdqa %%xmm5, %%xmm6 \n \t "
" punpckhwd %%xmm7, %%xmm5 \n \t "
" punpcklwd %%xmm7, %%xmm6 \n \t "
" paddd %%xmm6, %%xmm4 \n \t "
" movdqa 16(%% " REG_D " ), %%xmm6 \n \t "
" paddd %%xmm5, %%xmm6 \n \t "
" paddd %%xmm3, %%xmm4 \n \t "
" paddd %%xmm3, %%xmm6 \n \t "
/* row 0: scale down, saturate dword -> word -> unsigned byte, store 8 bytes */
" psrad $8, %%xmm0 \n \t " /* FRAC_BITS. */
" psrad $8, %%xmm2 \n \t " /* FRAC_BITS. */
" packssdw %%xmm2, %%xmm0 \n \t "
" packuswb %%xmm7, %%xmm0 \n \t "
" movq %%xmm0, (%% " REG_d " ) \n \t "
/* row 1: same, stored REG_c (src_stride) bytes after row 0 */
" psrad $8, %%xmm4 \n \t " /* FRAC_BITS. */
" psrad $8, %%xmm6 \n \t " /* FRAC_BITS. */
" packssdw %%xmm6, %%xmm4 \n \t "
" packuswb %%xmm7, %%xmm4 \n \t "
" movq %%xmm4, (%% " REG_d " ,%% " REG_c " ); \n \t "
snow_inner_add_yblock_sse2_end_8
}
/*
 * SSE2 kernel for 16-pixel-wide blocks; handles one row per loop iteration.
 *
 * Per row: the four byte sources block[3..0] are multiplied by the weights at
 * obmc offsets 0, 16, 512 and 528 and summed; the sum is added to sixteen
 * 32-bit values of the destination line, rounded with +0x80, shifted right by
 * 8, saturated to 0..255 and stored as 16 bytes at dst8.
 *
 * The weight offsets and the 32-byte step per row are hard-coded (matching
 * the obmc_32 in the name); obmc_stride, b_w and add are never read.  b_h has
 * to be non-zero because the counter is tested only after the first row.
 * The destination line is used as a paddd memory operand and so has to be
 * 16-byte aligned at src_x << 2; the store to dst8 is unaligned (movdqu).
 * The pointers stored in block[] are advanced in place.
 */
static void inner_add_yblock_bw_16_obmc_32_sse2 ( uint8_t * obmc , const long obmc_stride , uint8_t * * block , int b_w , long b_h ,
int src_x , int src_y , long src_stride , slice_buffer * sb , int add , uint8_t * dst8 ) {
snow_inner_add_yblock_sse2_header
/* xmm1 = weighted sum of pixels 0..7, xmm5 = weighted sum of pixels 8..15 */
snow_inner_add_yblock_sse2_start_16 ( " xmm1 " , " xmm5 " , " 3 " , " 0 " )
snow_inner_add_yblock_sse2_accum_16 ( " 2 " , " 16 " )
snow_inner_add_yblock_sse2_accum_16 ( " 1 " , " 512 " )
snow_inner_add_yblock_sse2_accum_16 ( " 0 " , " 528 " )
/* REG_d = dst8 */
" mov %0, %% " REG_d " \n \t "
/* widen the 16 word sums to dwords and add the destination line (REG_D) */
" movdqa %%xmm1, %%xmm0 \n \t "
" movdqa %%xmm5, %%xmm4 \n \t "
" punpcklwd %%xmm7, %%xmm0 \n \t "
" paddd (%% " REG_D " ), %%xmm0 \n \t "
" punpckhwd %%xmm7, %%xmm1 \n \t "
" paddd 16(%% " REG_D " ), %%xmm1 \n \t "
" punpcklwd %%xmm7, %%xmm4 \n \t "
" paddd 32(%% " REG_D " ), %%xmm4 \n \t "
" punpckhwd %%xmm7, %%xmm5 \n \t "
" paddd 48(%% " REG_D " ), %%xmm5 \n \t "
/* add the 0x80 rounding term held in xmm3 */
" paddd %%xmm3, %%xmm0 \n \t "
" paddd %%xmm3, %%xmm1 \n \t "
" paddd %%xmm3, %%xmm4 \n \t "
" paddd %%xmm3, %%xmm5 \n \t "
" psrad $8, %%xmm0 \n \t " /* FRAC_BITS. */
" psrad $8, %%xmm1 \n \t " /* FRAC_BITS. */
" psrad $8, %%xmm4 \n \t " /* FRAC_BITS. */
" psrad $8, %%xmm5 \n \t " /* FRAC_BITS. */
/* saturate dword -> word -> unsigned byte and store the 16 result bytes */
" packssdw %%xmm1, %%xmm0 \n \t "
" packssdw %%xmm5, %%xmm4 \n \t "
" packuswb %%xmm4, %%xmm0 \n \t "
" movdqu %%xmm0, (%% " REG_d " ) \n \t "
snow_inner_add_yblock_sse2_end_16
}
/*
 * Prologue shared by the MMX inner_add_yblock kernels.
 *
 * Declares dst_array (pointer to the sb->line entry for row src_y) and opens
 * an asm statement that is left unterminated on purpose: the kernel body and
 * snow_inner_add_yblock_mmx_end must follow to close it.  Operand numbers
 * refer to the operand lists in snow_inner_add_yblock_mmx_end.
 *
 * State established before/at label 1:
 *   REG_c = src_stride, REG_b = b_h (loop counter), REG_S = obmc,
 *   mm7   = 0, mm3 = 0x80 in both dwords,
 *   REG_D = dst_array[0] + (src_x << 2), recomputed on every iteration.
 */
# define snow_inner_add_yblock_mmx_header \
DWTELEM * * dst_array = sb - > line + src_y ; \
asm volatile ( \
" mov %6, %% " REG_c " \n \t " /* REG_c = src_stride */ \
" mov %5, %% " REG_b " \n \t " /* REG_b = b_h */ \
" mov %3, %% " REG_S " \n \t " /* REG_S = obmc */ \
" pxor %%mm7, %%mm7 \n \t " /* 0 */ \
" pcmpeqd %%mm3, %%mm3 \n \t " /* all bits set */ \
" pslld $31, %%mm3 \n \t " \
" psrld $24, %%mm3 \n \t " /* 0x80 per dword: rounding term added before the psrad $8 in the mix step */ \
" 1: \n \t " /* loop head, target of the closing jnz */ \
" mov %1, %% " REG_D " \n \t " \
" mov (%% " REG_D " ), %% " REG_D " \n \t " \
" add %2, %% " REG_D " \n \t " /* REG_D = dst_array[0] + (src_x << 2) bytes */
/*
 * Weighted load of 8 pixels of one row for the MMX kernels.
 *
 *   out_reg1   - mm register name receiving pixels 0..3 as 16-bit products
 *   out_reg2   - mm register name receiving pixels 4..7 as 16-bit products
 *   ptr_offset - index (string) into the pointer array addressed by REG_a
 *   s_offset   - byte offset (string) from REG_S of the first 4 weights; the
 *                next 4 weights are read 4 bytes further on
 *   d_offset   - byte offset (string) into the source row selected by
 *                ptr_offset
 *
 * Bytes are zero-extended to words against mm7 (zero) and multiplied with
 * pmullw, so only the low 16 bits of each product are kept.
 * Overwrites REG_d, mm0 and mm4.
 */
# define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
" mov " PTR_SIZE " * " ptr_offset " (%% " REG_a " ), %% " REG_d " ; \n \t " \
" movd " d_offset " (%% " REG_d " ), %% " out_reg1 " \n \t " \
" movd " d_offset " +4(%% " REG_d " ), %% " out_reg2 " \n \t " \
" punpcklbw %%mm7, %% " out_reg1 " \n \t " \
" punpcklbw %%mm7, %% " out_reg2 " \n \t " \
" movd " s_offset " (%% " REG_S " ), %%mm0 \n \t " \
" movd " s_offset " +4(%% " REG_S " ), %%mm4 \n \t " \
" punpcklbw %%mm7, %%mm0 \n \t " \
" punpcklbw %%mm7, %%mm4 \n \t " \
" pmullw %%mm0, %% " out_reg1 " \n \t " \
" pmullw %%mm4, %% " out_reg2 " \n \t "
/*
 * Computes one more weighted source with snow_inner_add_yblock_mmx_start into
 * the scratch registers mm2/mm6 and adds it, with unsigned word saturation
 * (paddusw), to the running sums held in mm1/mm5.
 */
# define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
snow_inner_add_yblock_mmx_start ( " mm2 " , " mm6 " , ptr_offset , s_offset , d_offset ) \
" paddusw %%mm2, %%mm1 \n \t " \
" paddusw %%mm6, %%mm5 \n \t "
/*
 * Combines the 8 word sums in mm1/mm5 with the destination line and stores
 * 8 result bytes.
 *
 *   read_offset  - byte offset (string) from REG_D of the eight 32-bit line
 *                  values to add (32 bytes are read)
 *   write_offset - byte offset (string) from dst8 (operand 0) of the 8-byte
 *                  store
 *
 * Steps: widen the sums to dwords against mm7 (zero), add the line values,
 * add the 0x80 rounding term from mm3, shift right arithmetically by 8, then
 * saturate dword -> word (packssdw) and word -> unsigned byte (packuswb).
 * Overwrites REG_d, mm0, mm1, mm4 and mm5.
 */
# define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
" mov %0, %% " REG_d " \n \t " \
" movq %%mm1, %%mm0 \n \t " \
" movq %%mm5, %%mm4 \n \t " \
" punpcklwd %%mm7, %%mm0 \n \t " \
" paddd " read_offset " (%% " REG_D " ), %%mm0 \n \t " \
" punpckhwd %%mm7, %%mm1 \n \t " \
" paddd " read_offset " +8(%% " REG_D " ), %%mm1 \n \t " \
" punpcklwd %%mm7, %%mm4 \n \t " \
" paddd " read_offset " +16(%% " REG_D " ), %%mm4 \n \t " \
" punpckhwd %%mm7, %%mm5 \n \t " \
" paddd " read_offset " +24(%% " REG_D " ), %%mm5 \n \t " \
" paddd %%mm3, %%mm0 \n \t " \
" paddd %%mm3, %%mm1 \n \t " \
" paddd %%mm3, %%mm4 \n \t " \
" paddd %%mm3, %%mm5 \n \t " \
" psrad $8, %%mm0 \n \t " \
" psrad $8, %%mm1 \n \t " \
" psrad $8, %%mm4 \n \t " \
" psrad $8, %%mm5 \n \t " \
\
" packssdw %%mm1, %%mm0 \n \t " \
" packssdw %%mm5, %%mm4 \n \t " \
" packuswb %%mm4, %%mm0 \n \t " \
" movq %%mm0, " write_offset " (%% " REG_d " ) \n \t "
/*
 * Loop tail of the MMX kernels; closes the asm statement opened by
 * snow_inner_add_yblock_mmx_header.
 *
 *   s_step - number of bytes (string) to advance the weight pointer REG_S
 *            per row
 *
 * Per iteration: REG_S advances by s_step, each of the four pointers stored
 * at REG_a advances by REG_c (src_stride) in memory, dst_array (%1) moves to
 * the next pointer slot, dst8 (%0) advances by REG_c, and the row counter in
 * REG_b is decremented; the loop repeats while it is non-zero.
 *
 * Operands:
 *   %0 dst8 (in/out, memory)      %1 dst_array (in/out, memory)
 *   %2 (long)(src_x << 2)         %3 obmc
 *   %4 block, pinned to REG_a     %5 (long)b_h
 *   %6 (long)src_stride
 *
 * The loop stores pixels through dst8 and rewrites the pointers in block[],
 * and it loads from obmc, the block rows and the destination lines; none of
 * that memory is an asm operand.  The "memory" clobber tells the compiler
 * that such memory is read and written, so values cached in registers are
 * not reused across the statement (relevant once these static kernels are
 * inlined into their callers).
 */
#define snow_inner_add_yblock_mmx_end(s_step)\
    "add $" s_step ", %%" REG_S " \n\t"\
    "add %%" REG_c ", " PTR_SIZE "*3(%%" REG_a "); \n\t"\
    "add %%" REG_c ", " PTR_SIZE "*2(%%" REG_a "); \n\t"\
    "add %%" REG_c ", " PTR_SIZE "*1(%%" REG_a "); \n\t"\
    "add %%" REG_c ", (%%" REG_a ") \n\t"\
    "add $" PTR_SIZE "*1, %1 \n\t"\
    "add %%" REG_c ", %0 \n\t"\
    "dec %%" REG_b " \n\t"\
    "jnz 1b \n\t"\
    : "+m"(dst8), "+m"(dst_array)\
    :\
    "rm"((long)(src_x << 2)), "m"(obmc), "a"(block), "m"((long)b_h), "rm"((long)src_stride) :\
    "%" REG_b "", "%" REG_c "", "%" REG_S "", "%" REG_D "", "%" REG_d "", "memory");
/*
 * MMX kernel for 8-pixel-wide blocks; handles one row per loop iteration.
 *
 * Per row: the four byte sources block[3..0] are multiplied by the weights at
 * obmc offsets 0, 8, 128 and 136 and summed; the mix step adds the sum to
 * eight 32-bit destination-line values, rounds, shifts right by 8, saturates
 * to 0..255 and stores 8 bytes at dst8.  The weight pointer then advances by
 * 16 bytes.
 *
 * The weight offsets and step are hard-coded (matching the obmc_16 in the
 * name); obmc_stride, b_w and add are never read.  b_h has to be non-zero
 * because the counter is tested only after the first row.  The pointers
 * stored in block[] are advanced in place.
 * NOTE(review): no emms is issued here; presumably the caller is responsible
 * for restoring the FPU state - confirm.
 */
static void inner_add_yblock_bw_8_obmc_16_mmx ( uint8_t * obmc , const long obmc_stride , uint8_t * * block , int b_w , long b_h ,
int src_x , int src_y , long src_stride , slice_buffer * sb , int add , uint8_t * dst8 ) {
snow_inner_add_yblock_mmx_header
/* mm1 = weighted sum of pixels 0..3, mm5 = weighted sum of pixels 4..7 */
snow_inner_add_yblock_mmx_start ( " mm1 " , " mm5 " , " 3 " , " 0 " , " 0 " )
snow_inner_add_yblock_mmx_accum ( " 2 " , " 8 " , " 0 " )
snow_inner_add_yblock_mmx_accum ( " 1 " , " 128 " , " 0 " )
snow_inner_add_yblock_mmx_accum ( " 0 " , " 136 " , " 0 " )
snow_inner_add_yblock_mmx_mix ( " 0 " , " 0 " )
snow_inner_add_yblock_mmx_end ( " 16 " )
}
/*
 * MMX kernel for 16-pixel-wide blocks; handles one row per loop iteration,
 * split into two 8-pixel halves.
 *
 * Left half: sources at row offset 0 with weights at obmc offsets 0, 16, 512
 * and 528; line values read at offset 0, result bytes stored at dst8 + 0.
 * Right half: sources at row offset 8 with weights at obmc offsets 8, 24, 520
 * and 536; line values read at offset 32, result bytes stored at dst8 + 8.
 * The weight pointer then advances by 32 bytes.
 *
 * The weight offsets and step are hard-coded (matching the obmc_32 in the
 * name); obmc_stride, b_w and add are never read.  b_h has to be non-zero
 * because the counter is tested only after the first row.  The pointers
 * stored in block[] are advanced in place.
 * NOTE(review): no emms is issued here; presumably the caller is responsible
 * for restoring the FPU state - confirm.
 */
static void inner_add_yblock_bw_16_obmc_32_mmx ( uint8_t * obmc , const long obmc_stride , uint8_t * * block , int b_w , long b_h ,
int src_x , int src_y , long src_stride , slice_buffer * sb , int add , uint8_t * dst8 ) {
snow_inner_add_yblock_mmx_header
/* left half: pixels 0..7 */
snow_inner_add_yblock_mmx_start ( " mm1 " , " mm5 " , " 3 " , " 0 " , " 0 " )
snow_inner_add_yblock_mmx_accum ( " 2 " , " 16 " , " 0 " )
snow_inner_add_yblock_mmx_accum ( " 1 " , " 512 " , " 0 " )
snow_inner_add_yblock_mmx_accum ( " 0 " , " 528 " , " 0 " )
snow_inner_add_yblock_mmx_mix ( " 0 " , " 0 " )
/* right half: pixels 8..15 */
snow_inner_add_yblock_mmx_start ( " mm1 " , " mm5 " , " 3 " , " 8 " , " 8 " )
snow_inner_add_yblock_mmx_accum ( " 2 " , " 24 " , " 8 " )
snow_inner_add_yblock_mmx_accum ( " 1 " , " 520 " , " 8 " )
snow_inner_add_yblock_mmx_accum ( " 0 " , " 536 " , " 8 " )
snow_inner_add_yblock_mmx_mix ( " 32 " , " 8 " )
snow_inner_add_yblock_mmx_end ( " 32 " )
}
/**
 * SSE2 entry point for inner_add_yblock: picks a specialised asm kernel when
 * the block geometry matches one, and otherwise defers to the generic
 * ff_snow_inner_add_yblock().
 *
 * The asm kernels are only valid under conditions they do not check
 * themselves, so all of them are enforced here:
 *  - their weight-table offsets are hard-coded, so the 16-wide kernel needs
 *    obmc_stride == 32 and the 8-wide kernels need obmc_stride == 16;
 *  - they never read `add` and always write the rounded, clipped sum to
 *    dst8, so they are used only when add is non-zero (presumably the
 *    generic routine treats add == 0 differently - it is left to handle it);
 *  - their row loop tests the counter after the first row, so b_h must be
 *    positive;
 *  - the two-rows-per-iteration SSE2 8-wide kernel additionally needs an
 *    even b_h; odd heights use the MMX 8-wide kernel.
 *
 * All arguments are forwarded unchanged to whichever routine is selected.
 */
void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t **block, int b_w, int b_h,
                                   int src_x, int src_y, int src_stride, slice_buffer *sb, int add, uint8_t *dst8)
{
    /* preconditions common to every asm kernel */
    const int asm_usable = add && b_h > 0;

    if (asm_usable && b_w == 16 && obmc_stride == 32) {
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
    } else if (asm_usable && b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
    } else {
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
    }
}
/**
 * MMX entry point for inner_add_yblock: picks a specialised asm kernel when
 * the block geometry matches one, and otherwise defers to the generic
 * ff_snow_inner_add_yblock().
 *
 * The asm kernels are only valid under conditions they do not check
 * themselves, so all of them are enforced here:
 *  - their weight-table offsets are hard-coded, so the 16-wide kernel needs
 *    obmc_stride == 32 and the 8-wide kernel needs obmc_stride == 16;
 *  - they never read `add` and always write the rounded, clipped sum to
 *    dst8, so they are used only when add is non-zero (presumably the
 *    generic routine treats add == 0 differently - it is left to handle it);
 *  - their row loop tests the counter after the first row, so b_h must be
 *    positive.
 *
 * All arguments are forwarded unchanged to whichever routine is selected.
 */
void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t **block, int b_w, int b_h,
                                  int src_x, int src_y, int src_stride, slice_buffer *sb, int add, uint8_t *dst8)
{
    /* preconditions common to every asm kernel */
    const int asm_usable = add && b_h > 0;

    if (asm_usable && b_w == 16 && obmc_stride == 32)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
    else if (asm_usable && b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
}