avcodec/x86/me_cmp: Remove obsolete MMX(EXT) functions

x64 always has MMX, MMXEXT, SSE and SSE2 and this means
that some functions for MMX, MMXEXT and 3dnow are always
overridden by other functions (unless one e.g. explicitly
disables SSE2) for x64. So given that the only systems that
benefit from these functions are truely ancient 32bit x86s
they are removed.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
release/5.1
Andreas Rheinhardt 2 years ago
parent c5dd2fdc09
commit 542765ce3e
  1. 9
      libavcodec/x86/me_cmp.asm
  2. 349
      libavcodec/x86/me_cmp_init.c

@ -261,11 +261,10 @@ hadamard8_16_wrapper 0, 14
%endif
%endmacro
INIT_MMX mmx
HADAMARD8_DIFF
%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
HADAMARD8_DIFF
%endif
INIT_XMM sse2
%if ARCH_X86_64
@ -385,10 +384,6 @@ cglobal sum_abs_dctelem, 1, 1, %1, block
RET
%endmacro
INIT_MMX mmx
SUM_ABS_DCTELEM 0, 4
INIT_MMX mmxext
SUM_ABS_DCTELEM 0, 4
INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3

@ -30,8 +30,6 @@
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
@ -85,7 +83,6 @@ int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
uint8_t *src2, ptrdiff_t stride, int h);
hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
@ -126,232 +123,12 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
#if HAVE_INLINE_ASM
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
ptrdiff_t stride, int h)
{
int tmp;
av_assert2(((uintptr_t) pix & 7) == 0);
av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq 8(%0), %%mm3\n" \
"add %2,%0\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
__asm__ volatile (
"movl %3, %%ecx\n"
"pxor %%mm6, %%mm6\n"
"pxor %%mm7, %%mm7\n"
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"add %2, %0\n"
"jmp 2f\n"
"1:\n"
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"subl $2, %%ecx\n"
"jnz 1b\n"
"movq %%mm6, %%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6, %%mm0\n"
"movq %%mm0, %%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6, %%mm0\n"
"movd %%mm0, %1\n"
: "+r" (pix), "=r" (tmp)
: "r" (stride), "m" (h)
: "%ecx");
return tmp & 0xFFFF;
}
#undef SUM
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
ptrdiff_t stride, int h)
{
int tmp;
av_assert2(((uintptr_t)pix1 & 7) == 0);
av_assert2(((uintptr_t)pix2 & 7) == 0);
av_assert2((stride & 7) == 0);
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq (%1), " #out0 "\n" \
"movq 8(%0), %%mm3\n" \
"movq 8(%1), " #out1 "\n" \
"add %3, %0\n" \
"add %3, %1\n" \
"psubb " #out0 ", %%mm2\n" \
"psubb " #out1 ", %%mm3\n" \
"pxor %%mm7, %%mm2\n" \
"pxor %%mm7, %%mm3\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
__asm__ volatile (
"movl %4, %%ecx\n"
"pxor %%mm6, %%mm6\n"
"pcmpeqw %%mm7, %%mm7\n"
"psllw $15, %%mm7\n"
"packsswb %%mm7, %%mm7\n"
"movq (%0), %%mm0\n"
"movq (%1), %%mm2\n"
"movq 8(%0), %%mm1\n"
"movq 8(%1), %%mm3\n"
"add %3, %0\n"
"add %3, %1\n"
"psubb %%mm2, %%mm0\n"
"psubb %%mm3, %%mm1\n"
"pxor %%mm7, %%mm0\n"
"pxor %%mm7, %%mm1\n"
"jmp 2f\n"
"1:\n"
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"subl $2, %%ecx\n"
"jnz 1b\n"
"movq %%mm6, %%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6, %%mm0\n"
"movq %%mm0, %%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6, %%mm0\n"
"movd %%mm0, %2\n"
: "+r" (pix1), "+r" (pix2), "=r" (tmp)
: "r" (stride), "m" (h)
: "%ecx");
return tmp & 0x7FFF;
}
#undef SUM
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
"add %3, %%"FF_REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %3, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t"
"psrlw $1, %%mm3 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm2, %%mm1 \n\t"
"por %%mm4, %%mm1 \n\t"
"movq %%mm1, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %4, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
"r" (stride));
}
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
@ -421,63 +198,7 @@ static inline int sum_mmx(void)
return ret & 0xFFFF;
}
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h)
{
sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
:); \
\
sad8_1_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
av_assert2(h == 8); \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
\
return sum_ ## suf(); \
} \
\
#define PIX_SADXY(suf) \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
@ -492,50 +213,6 @@ static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
return sum_ ## suf(); \
} \
\
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
:); \
\
sad8_1_ ## suf(blk1, blk2, stride, h); \
sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_x2a_ ## suf(blk1, blk2, stride, h); \
sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
"movq %0, %%mm5 \n\t" \
:: "m" (round_tab[1])); \
\
sad8_y2a_ ## suf(blk1, blk2, stride, h); \
sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
@ -550,7 +227,7 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
return sum_ ## suf(); \
} \
PIX_SAD(mmx)
PIX_SADXY(mmx)
#endif /* HAVE_INLINE_ASM */
@ -560,32 +237,13 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
c->pix_abs[0][3] = sad16_xy2_mmx;
c->pix_abs[1][0] = sad8_mmx;
c->pix_abs[1][1] = sad8_x2_mmx;
c->pix_abs[1][2] = sad8_y2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;
c->sad[0] = sad16_mmx;
c->sad[1] = sad8_mmx;
c->vsad[4] = vsad_intra16_mmx;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->vsad[0] = vsad16_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
c->sse[0] = ff_sse16_mmx;
c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
c->nsse[0] = nsse16_mmx;
@ -594,9 +252,10 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if !HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
#endif
c->sad[0] = ff_sad16_mmxext;
c->sad[1] = ff_sad8_mmxext;

Loading…
Cancel
Save