|
|
|
@ -22,6 +22,7 @@ |
|
|
|
|
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
#include "libavutil/cpu.h" |
|
|
|
|
#include "libavutil/x86_cpu.h" |
|
|
|
|
#include "libavcodec/dsputil.h" |
|
|
|
|
#include "libavcodec/h264dsp.h" |
|
|
|
@ -2525,7 +2526,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
int mm_flags = mm_support(); |
|
|
|
|
|
|
|
|
|
if (avctx->dsp_mask) { |
|
|
|
|
if (avctx->dsp_mask & FF_MM_FORCE) |
|
|
|
|
if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) |
|
|
|
|
mm_flags |= (avctx->dsp_mask & 0xffff); |
|
|
|
|
else |
|
|
|
|
mm_flags &= ~(avctx->dsp_mask & 0xffff); |
|
|
|
@ -2533,20 +2534,20 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
|
|
|
|
|
#if 0 |
|
|
|
|
av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); |
|
|
|
|
if (mm_flags & FF_MM_MMX) |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_MMX) |
|
|
|
|
av_log(avctx, AV_LOG_INFO, " mmx"); |
|
|
|
|
if (mm_flags & FF_MM_MMX2) |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_MMX2) |
|
|
|
|
av_log(avctx, AV_LOG_INFO, " mmx2"); |
|
|
|
|
if (mm_flags & FF_MM_3DNOW) |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_3DNOW) |
|
|
|
|
av_log(avctx, AV_LOG_INFO, " 3dnow"); |
|
|
|
|
if (mm_flags & FF_MM_SSE) |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_SSE) |
|
|
|
|
av_log(avctx, AV_LOG_INFO, " sse"); |
|
|
|
|
if (mm_flags & FF_MM_SSE2) |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_SSE2) |
|
|
|
|
av_log(avctx, AV_LOG_INFO, " sse2"); |
|
|
|
|
av_log(avctx, AV_LOG_INFO, "\n"); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (mm_flags & FF_MM_MMX) { |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_MMX) { |
|
|
|
|
const int idct_algo= avctx->idct_algo; |
|
|
|
|
|
|
|
|
|
if(avctx->lowres==0){ |
|
|
|
@ -2557,7 +2558,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; |
|
|
|
|
#if CONFIG_GPL |
|
|
|
|
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ |
|
|
|
|
if(mm_flags & FF_MM_MMX2){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_MMX2){ |
|
|
|
|
c->idct_put= ff_libmpeg2mmx2_idct_put; |
|
|
|
|
c->idct_add= ff_libmpeg2mmx2_idct_add; |
|
|
|
|
c->idct = ff_mmxext_idct; |
|
|
|
@ -2570,7 +2571,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
#endif |
|
|
|
|
}else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) && |
|
|
|
|
idct_algo==FF_IDCT_VP3 && HAVE_YASM){ |
|
|
|
|
if(mm_flags & FF_MM_SSE2){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
|
|
|
c->idct_put= ff_vp3_idct_put_sse2; |
|
|
|
|
c->idct_add= ff_vp3_idct_add_sse2; |
|
|
|
|
c->idct = ff_vp3_idct_sse2; |
|
|
|
@ -2584,12 +2585,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
}else if(idct_algo==FF_IDCT_CAVS){ |
|
|
|
|
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; |
|
|
|
|
}else if(idct_algo==FF_IDCT_XVIDMMX){ |
|
|
|
|
if(mm_flags & FF_MM_SSE2){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
|
|
|
c->idct_put= ff_idct_xvid_sse2_put; |
|
|
|
|
c->idct_add= ff_idct_xvid_sse2_add; |
|
|
|
|
c->idct = ff_idct_xvid_sse2; |
|
|
|
|
c->idct_permutation_type= FF_SSE2_IDCT_PERM; |
|
|
|
|
}else if(mm_flags & FF_MM_MMX2){ |
|
|
|
|
}else if(mm_flags & AV_CPU_FLAG_MMX2){ |
|
|
|
|
c->idct_put= ff_idct_xvid_mmx2_put; |
|
|
|
|
c->idct_add= ff_idct_xvid_mmx2_add; |
|
|
|
|
c->idct = ff_idct_xvid_mmx2; |
|
|
|
@ -2606,7 +2607,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->add_pixels_clamped = ff_add_pixels_clamped_mmx; |
|
|
|
|
c->clear_block = clear_block_mmx; |
|
|
|
|
c->clear_blocks = clear_blocks_mmx; |
|
|
|
|
if ((mm_flags & FF_MM_SSE) && |
|
|
|
|
if ((mm_flags & AV_CPU_FLAG_SSE) && |
|
|
|
|
!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){ |
|
|
|
|
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ |
|
|
|
|
c->clear_block = clear_block_sse; |
|
|
|
@ -2649,7 +2650,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx; |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if (mm_flags & FF_MM_MMX2) { |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_MMX2) { |
|
|
|
|
c->prefetch = prefetch_mmx2; |
|
|
|
|
|
|
|
|
|
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; |
|
|
|
@ -2740,7 +2741,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; |
|
|
|
|
#endif |
|
|
|
|
#if HAVE_7REGS && HAVE_TEN_OPERANDS |
|
|
|
|
if( mm_flags&FF_MM_3DNOW ) |
|
|
|
|
if( mm_flags&AV_CPU_FLAG_3DNOW ) |
|
|
|
|
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
@ -2748,7 +2749,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
ff_vc1dsp_init_mmx(c, avctx); |
|
|
|
|
|
|
|
|
|
c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; |
|
|
|
|
} else if (mm_flags & FF_MM_3DNOW) { |
|
|
|
|
} else if (mm_flags & AV_CPU_FLAG_3DNOW) { |
|
|
|
|
c->prefetch = prefetch_3dnow; |
|
|
|
|
|
|
|
|
|
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; |
|
|
|
@ -2816,13 +2817,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
|
|
|
|
|
c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
|
|
|
|
|
c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; |
|
|
|
|
if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ |
|
|
|
|
if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){ |
|
|
|
|
// these functions are slower than mmx on AMD, but faster on Intel
|
|
|
|
|
c->put_pixels_tab[0][0] = put_pixels16_sse2; |
|
|
|
|
c->avg_pixels_tab[0][0] = avg_pixels16_sse2; |
|
|
|
|
H264_QPEL_FUNCS(0, 0, sse2); |
|
|
|
|
} |
|
|
|
|
if(mm_flags & FF_MM_SSE2){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
|
|
|
H264_QPEL_FUNCS(0, 1, sse2); |
|
|
|
|
H264_QPEL_FUNCS(0, 2, sse2); |
|
|
|
|
H264_QPEL_FUNCS(0, 3, sse2); |
|
|
|
@ -2837,7 +2838,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
H264_QPEL_FUNCS(3, 3, sse2); |
|
|
|
|
} |
|
|
|
|
#if HAVE_SSSE3 |
|
|
|
|
if(mm_flags & FF_MM_SSSE3){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_SSSE3){ |
|
|
|
|
H264_QPEL_FUNCS(1, 0, ssse3); |
|
|
|
|
H264_QPEL_FUNCS(1, 1, ssse3); |
|
|
|
|
H264_QPEL_FUNCS(1, 2, ssse3); |
|
|
|
@ -2859,13 +2860,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3; |
|
|
|
|
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3; |
|
|
|
|
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; |
|
|
|
|
if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe
|
|
|
|
|
if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
|
|
|
|
|
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
if(mm_flags & FF_MM_3DNOW){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_3DNOW){ |
|
|
|
|
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; |
|
|
|
|
c->vector_fmul = vector_fmul_3dnow; |
|
|
|
|
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
|
|
|
@ -2873,20 +2874,20 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->float_to_int16_interleave = float_to_int16_interleave_3dnow; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if(mm_flags & FF_MM_3DNOWEXT){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ |
|
|
|
|
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; |
|
|
|
|
c->vector_fmul_window = vector_fmul_window_3dnow2; |
|
|
|
|
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
|
|
|
|
c->float_to_int16_interleave = float_to_int16_interleave_3dn2; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if(mm_flags & FF_MM_MMX2){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_MMX2){ |
|
|
|
|
#if HAVE_YASM |
|
|
|
|
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; |
|
|
|
|
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
if(mm_flags & FF_MM_SSE){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_SSE){ |
|
|
|
|
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; |
|
|
|
|
c->ac3_downmix = ac3_downmix_sse; |
|
|
|
|
c->vector_fmul = vector_fmul_sse; |
|
|
|
@ -2901,9 +2902,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->scalarproduct_float = ff_scalarproduct_float_sse; |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
if(mm_flags & FF_MM_3DNOW) |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_3DNOW) |
|
|
|
|
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
|
|
|
|
|
if(mm_flags & FF_MM_SSE2){ |
|
|
|
|
if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
|
|
|
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; |
|
|
|
|
c->float_to_int16 = float_to_int16_sse2; |
|
|
|
|
c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
|
|
|
@ -2912,7 +2913,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
|
|
|
|
|
if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
|
|
|
|
|
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|