|
|
|
@ -30,181 +30,14 @@ |
|
|
|
|
#include "libavcodec/mathops.h" |
|
|
|
|
#include "dsputil_mmx.h" |
|
|
|
|
|
|
|
|
|
void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size); |
|
|
|
|
void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size); |
|
|
|
|
void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); |
|
|
|
|
int ff_pix_sum16_mmx(uint8_t * pix, int line_size); |
|
|
|
|
int ff_pix_norm1_mmx(uint8_t *pix, int line_size); |
|
|
|
|
|
|
|
|
|
#if HAVE_INLINE_ASM |
|
|
|
|
|
|
|
|
|
/**
 * Read an 8x8 block of 8-bit pixels and store it zero-extended to 16 bits.
 *
 * Two source rows are processed per loop iteration; four iterations cover
 * the eight rows.
 *
 * @param block     destination; 128 bytes are written starting at block
 *                  (assumes DCTELEM is 16 bits wide, so that block+64 is the
 *                  end of those 128 bytes -- TODO confirm against the typedef)
 * @param pixels    first pixel of the top source row
 * @param line_size offset in bytes from one source row to the next
 */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        /* REG_a is a negative byte offset relative to block+64; it counts
         * up by 32 per iteration and the loop ends when it reaches 0. */
        "mov        $-128, %%"REG_a"        \n\t"
        "pxor       %%mm7, %%mm7            \n\t" /* mm7 = 0, used for zero extension */
        ".p2align 4                         \n\t"
        "1:                                 \n\t"
        "movq       (%0), %%mm0             \n\t" /* row n,   8 pixels */
        "movq       (%0, %2), %%mm2         \n\t" /* row n+1, 8 pixels */
        "movq       %%mm0, %%mm1            \n\t"
        "movq       %%mm2, %%mm3            \n\t"
        "punpcklbw  %%mm7, %%mm0            \n\t" /* row n,   pixels 0-3 as words */
        "punpckhbw  %%mm7, %%mm1            \n\t" /* row n,   pixels 4-7 as words */
        "punpcklbw  %%mm7, %%mm2            \n\t" /* row n+1, pixels 0-3 as words */
        "punpckhbw  %%mm7, %%mm3            \n\t" /* row n+1, pixels 4-7 as words */
        "movq       %%mm0, (%1, %%"REG_a")  \n\t"
        "movq       %%mm1, 8(%1, %%"REG_a") \n\t"
        "movq       %%mm2, 16(%1, %%"REG_a")\n\t"
        "movq       %%mm3, 24(%1, %%"REG_a")\n\t"
        "add        %3, %0                  \n\t" /* advance source by two rows */
        "add        $32, %%"REG_a"          \n\t"
        "js         1b                      \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        /* "memory": the stores go through a pointer that is only an input
         * operand, so the compiler has to be told that memory is modified. */
        : "%"REG_a, "memory"
    );
}
|
|
|
|
|
|
|
|
|
/**
 * Read an 8x8 block of 8-bit pixels and store it zero-extended to 16 bits,
 * using SSE2 registers (one full output row per register).
 *
 * The code is fully unrolled: rows 0-3 are converted first, then the source
 * pointer is advanced by four rows and rows 4-7 are converted.
 *
 * @param block     destination; 128 bytes are written. The stores use
 *                  movdqa, so block must be 16-byte aligned.
 * @param pixels    first pixel of the top source row
 * @param line_size offset in bytes from one source row to the next
 */
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    /* NOTE(review): xmm0-xmm4 are overwritten but not listed as clobbers. */
    __asm__ volatile(
        "pxor       %%xmm4, %%xmm4          \n\t" /* xmm4 = 0, used for zero extension */
        /* rows 0-3 */
        "movq       (%0), %%xmm0            \n\t"
        "movq       (%0, %2), %%xmm1        \n\t"
        "movq       (%0, %2,2), %%xmm2      \n\t"
        "movq       (%0, %3), %%xmm3        \n\t" /* %3 = 3*line_size */
        "lea        (%0,%2,4), %0           \n\t" /* pixels += 4*line_size */
        "punpcklbw  %%xmm4, %%xmm0          \n\t"
        "punpcklbw  %%xmm4, %%xmm1          \n\t"
        "punpcklbw  %%xmm4, %%xmm2          \n\t"
        "punpcklbw  %%xmm4, %%xmm3          \n\t"
        "movdqa     %%xmm0, (%1)            \n\t"
        "movdqa     %%xmm1, 16(%1)          \n\t"
        "movdqa     %%xmm2, 32(%1)          \n\t"
        "movdqa     %%xmm3, 48(%1)          \n\t"
        /* rows 4-7 */
        "movq       (%0), %%xmm0            \n\t"
        "movq       (%0, %2), %%xmm1        \n\t"
        "movq       (%0, %2,2), %%xmm2      \n\t"
        "movq       (%0, %3), %%xmm3        \n\t"
        "punpcklbw  %%xmm4, %%xmm0          \n\t"
        "punpcklbw  %%xmm4, %%xmm1          \n\t"
        "punpcklbw  %%xmm4, %%xmm2          \n\t"
        "punpcklbw  %%xmm4, %%xmm3          \n\t"
        "movdqa     %%xmm0, 64(%1)          \n\t"
        "movdqa     %%xmm1, 80(%1)          \n\t"
        "movdqa     %%xmm2, 96(%1)          \n\t"
        "movdqa     %%xmm3, 112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
        /* "memory": block is only an input operand, yet 128 bytes are stored
         * through it; the compiler has to be told that memory is modified. */
        : "memory"
    );
}
|
|
|
|
|
|
|
|
|
/**
 * Compute the per-pixel difference s1 - s2 of two 8x8 blocks of 8-bit
 * pixels and store the results as 16-bit words.
 *
 * One row of eight pixels is processed per loop iteration, eight iterations
 * in total.
 *
 * @param block  destination; 128 bytes are written starting at block
 *               (assumes DCTELEM is 16 bits wide, so that block+64 is the
 *               end of those 128 bytes -- TODO confirm against the typedef)
 * @param s1     first pixel of the minuend block
 * @param s2     first pixel of the subtrahend block
 * @param stride offset in bytes from one row to the next; the same value is
 *               applied to both s1 and s2
 */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor       %%mm7, %%mm7            \n\t" /* mm7 = 0, used for zero extension */
        /* REG_a is a negative byte offset relative to block+64; it counts
         * up by 16 per row and the loop ends when it reaches 0. */
        "mov        $-128, %%"REG_a"        \n\t"
        ".p2align 4                         \n\t"
        "1:                                 \n\t"
        "movq       (%0), %%mm0             \n\t" /* 8 pixels of s1 */
        "movq       (%1), %%mm2             \n\t" /* 8 pixels of s2 */
        "movq       %%mm0, %%mm1            \n\t"
        "movq       %%mm2, %%mm3            \n\t"
        "punpcklbw  %%mm7, %%mm0            \n\t" /* s1 pixels 0-3 as words */
        "punpckhbw  %%mm7, %%mm1            \n\t" /* s1 pixels 4-7 as words */
        "punpcklbw  %%mm7, %%mm2            \n\t" /* s2 pixels 0-3 as words */
        "punpckhbw  %%mm7, %%mm3            \n\t" /* s2 pixels 4-7 as words */
        "psubw      %%mm2, %%mm0            \n\t" /* s1 - s2, pixels 0-3 */
        "psubw      %%mm3, %%mm1            \n\t" /* s1 - s2, pixels 4-7 */
        "movq       %%mm0, (%2, %%"REG_a")  \n\t"
        "movq       %%mm1, 8(%2, %%"REG_a") \n\t"
        "add        %3, %0                  \n\t"
        "add        %3, %1                  \n\t"
        "add        $16, %%"REG_a"          \n\t"
        "jnz        1b                      \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        /* "memory": the stores go through a pointer that is only an input
         * operand, so the compiler has to be told that memory is modified. */
        : "%"REG_a, "memory"
    );
}
|
|
|
|
|
|
|
|
|
static int pix_sum16_mmx(uint8_t * pix, int line_size){ |
|
|
|
|
const int h=16; |
|
|
|
|
int sum; |
|
|
|
|
x86_reg index= -line_size*h; |
|
|
|
|
|
|
|
|
|
__asm__ volatile( |
|
|
|
|
"pxor %%mm7, %%mm7 \n\t" |
|
|
|
|
"pxor %%mm6, %%mm6 \n\t" |
|
|
|
|
"1: \n\t" |
|
|
|
|
"movq (%2, %1), %%mm0 \n\t" |
|
|
|
|
"movq (%2, %1), %%mm1 \n\t" |
|
|
|
|
"movq 8(%2, %1), %%mm2 \n\t" |
|
|
|
|
"movq 8(%2, %1), %%mm3 \n\t" |
|
|
|
|
"punpcklbw %%mm7, %%mm0 \n\t" |
|
|
|
|
"punpckhbw %%mm7, %%mm1 \n\t" |
|
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t" |
|
|
|
|
"punpckhbw %%mm7, %%mm3 \n\t" |
|
|
|
|
"paddw %%mm0, %%mm1 \n\t" |
|
|
|
|
"paddw %%mm2, %%mm3 \n\t" |
|
|
|
|
"paddw %%mm1, %%mm3 \n\t" |
|
|
|
|
"paddw %%mm3, %%mm6 \n\t" |
|
|
|
|
"add %3, %1 \n\t" |
|
|
|
|
" js 1b \n\t" |
|
|
|
|
"movq %%mm6, %%mm5 \n\t" |
|
|
|
|
"psrlq $32, %%mm6 \n\t" |
|
|
|
|
"paddw %%mm5, %%mm6 \n\t" |
|
|
|
|
"movq %%mm6, %%mm5 \n\t" |
|
|
|
|
"psrlq $16, %%mm6 \n\t" |
|
|
|
|
"paddw %%mm5, %%mm6 \n\t" |
|
|
|
|
"movd %%mm6, %0 \n\t" |
|
|
|
|
"andl $0xFFFF, %0 \n\t" |
|
|
|
|
: "=&r" (sum), "+r" (index) |
|
|
|
|
: "r" (pix - index), "r" ((x86_reg)line_size) |
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
return sum; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
static int pix_norm1_mmx(uint8_t *pix, int line_size) { |
|
|
|
|
int tmp; |
|
|
|
|
__asm__ volatile ( |
|
|
|
|
"movl $16,%%ecx\n" |
|
|
|
|
"pxor %%mm0,%%mm0\n" |
|
|
|
|
"pxor %%mm7,%%mm7\n" |
|
|
|
|
"1:\n" |
|
|
|
|
"movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ |
|
|
|
|
"movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ |
|
|
|
|
|
|
|
|
|
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ |
|
|
|
|
|
|
|
|
|
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ |
|
|
|
|
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ |
|
|
|
|
|
|
|
|
|
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ |
|
|
|
|
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ |
|
|
|
|
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ |
|
|
|
|
|
|
|
|
|
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ |
|
|
|
|
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ |
|
|
|
|
|
|
|
|
|
"pmaddwd %%mm3,%%mm3\n" |
|
|
|
|
"pmaddwd %%mm4,%%mm4\n" |
|
|
|
|
|
|
|
|
|
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
|
|
|
|
|
pix2^2+pix3^2+pix6^2+pix7^2) */ |
|
|
|
|
"paddd %%mm3,%%mm4\n" |
|
|
|
|
"paddd %%mm2,%%mm7\n" |
|
|
|
|
|
|
|
|
|
"add %2, %0\n" |
|
|
|
|
"paddd %%mm4,%%mm7\n" |
|
|
|
|
"dec %%ecx\n" |
|
|
|
|
"jnz 1b\n" |
|
|
|
|
|
|
|
|
|
"movq %%mm7,%%mm1\n" |
|
|
|
|
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ |
|
|
|
|
"paddd %%mm7,%%mm1\n" |
|
|
|
|
"movd %%mm1,%1\n" |
|
|
|
|
: "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); |
|
|
|
|
return tmp; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
|
|
|
|
int tmp; |
|
|
|
|
__asm__ volatile ( |
|
|
|
@ -1112,10 +945,23 @@ hadamard_func(ssse3) |
|
|
|
|
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
{ |
|
|
|
|
int mm_flags = av_get_cpu_flags(); |
|
|
|
|
|
|
|
|
|
#if HAVE_INLINE_ASM |
|
|
|
|
int bit_depth = avctx->bits_per_raw_sample; |
|
|
|
|
|
|
|
|
|
#if HAVE_YASM |
|
|
|
|
if (EXTERNAL_MMX(mm_flags)) { |
|
|
|
|
if (bit_depth <= 8) |
|
|
|
|
c->get_pixels = ff_get_pixels_mmx; |
|
|
|
|
c->diff_pixels = ff_diff_pixels_mmx; |
|
|
|
|
c->pix_sum = ff_pix_sum16_mmx; |
|
|
|
|
|
|
|
|
|
c->pix_norm1 = ff_pix_norm1_mmx; |
|
|
|
|
} |
|
|
|
|
if (EXTERNAL_SSE2(mm_flags)) |
|
|
|
|
if (bit_depth <= 8) |
|
|
|
|
c->get_pixels = ff_get_pixels_sse2; |
|
|
|
|
#endif /* HAVE_YASM */ |
|
|
|
|
|
|
|
|
|
#if HAVE_INLINE_ASM |
|
|
|
|
if (mm_flags & AV_CPU_FLAG_MMX) { |
|
|
|
|
const int dct_algo = avctx->dct_algo; |
|
|
|
|
if (avctx->bits_per_raw_sample <= 8 && |
|
|
|
@ -1129,15 +975,10 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (bit_depth <= 8) |
|
|
|
|
c->get_pixels = get_pixels_mmx; |
|
|
|
|
c->diff_pixels = diff_pixels_mmx; |
|
|
|
|
c->pix_sum = pix_sum16_mmx; |
|
|
|
|
|
|
|
|
|
c->diff_bytes= diff_bytes_mmx; |
|
|
|
|
c->sum_abs_dctelem= sum_abs_dctelem_mmx; |
|
|
|
|
|
|
|
|
|
c->pix_norm1 = pix_norm1_mmx; |
|
|
|
|
c->sse[0] = sse16_mmx; |
|
|
|
|
c->sse[1] = sse8_mmx; |
|
|
|
|
c->vsad[4]= vsad_intra16_mmx; |
|
|
|
@ -1167,8 +1008,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
|
|
|
if (bit_depth <= 8) |
|
|
|
|
c->get_pixels = get_pixels_sse2; |
|
|
|
|
c->sum_abs_dctelem= sum_abs_dctelem_sse2; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|