mirror of https://github.com/FFmpeg/FFmpeg.git
parent
e0a8f31591
commit
3ced55d51c
9 changed files with 967 additions and 660 deletions
@ -0,0 +1,106 @@ |
||||
;****************************************************************************** |
||||
;* MMX optimized DSP utils |
||||
;* Copyright (c) 2008 Loren Merritt |
||||
;* Copyright (c) 2003-2013 Michael Niedermayer |
||||
;* Copyright (c) 2013 Daniel Kang |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "libavutil/x86/x86util.asm" |
||||
|
||||
SECTION .text |
||||
|
||||
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
;
; Generates put/avg copy loops for 4- and 8-pixel-wide blocks,
; processing four rows per iteration.
;   %1 = operation (put or avg), %2 = block width (4 or 8)
%macro PIXELS48 2
%if %2 == 4
%define OP movh                         ; 4-wide: 32-bit loads/stores
%else
%define OP mova                         ; 8-wide: full mmx register
%endif
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d                ; sign-extend line_size on 64-bit
    lea          r4, [r2*3]             ; r4 = 3 * line_size
.loop:
    OP           m0, [r1]
    OP           m1, [r1+r2]
    OP           m2, [r1+r2*2]
    OP           m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; advance src by 4 rows
%ifidn %1, avg
    pavgb        m0, [r0]               ; avg variant: blend with dst
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
%endif
    OP           [r0], m0
    OP           [r0+r2], m1
    OP           [r0+r2*2], m2
    OP           [r0+r4], m3
    sub          r3d, 4                 ; h -= 4
    lea          r0, [r0+r2*4]          ; advance dst by 4 rows
    jne          .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
||||
|
||||
|
||||
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
;                        ptrdiff_t line_size, int h)
; Straight 16-byte-wide copy, four rows per iteration.
; Source loads are unaligned (movu); destination is assumed aligned (mova).
cglobal put_pixels16, 4,5,4
    lea          r4, [r2*3]             ; r4 = 3 * line_size
.loop:
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; advance src by 4 rows
    mova         [r0], m0
    mova         [r0+r2], m1
    mova         [r0+r2*2], m2
    mova         [r0+r4], m3
    sub          r3d, 4                 ; h -= 4
    lea          r0, [r0+r2*4]          ; advance dst by 4 rows
    jnz          .loop
    REP_RET

; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
;                        ptrdiff_t line_size, int h)
; Same layout as put_pixels16, but rounds-to-average against the
; existing destination via pavgb before storing.
cglobal avg_pixels16, 4,5,4
    lea          r4, [r2*3]             ; r4 = 3 * line_size
.loop:
    movu         m0, [r1]
    movu         m1, [r1+r2]
    movu         m2, [r1+r2*2]
    movu         m3, [r1+r4]
    lea          r1, [r1+r2*4]          ; advance src by 4 rows
    pavgb        m0, [r0]               ; blend with current dst rows
    pavgb        m1, [r0+r2]
    pavgb        m2, [r0+r2*2]
    pavgb        m3, [r0+r4]
    mova         [r0], m0
    mova         [r0+r2], m1
    mova         [r0+r2*2], m2
    mova         [r0+r4], m3
    sub          r3d, 4                 ; h -= 4
    lea          r0, [r0+r2*4]          ; advance dst by 4 rows
    jnz          .loop
    REP_RET
@ -0,0 +1,415 @@ |
||||
/*
|
||||
* MMX optimized DSP utils |
||||
* Copyright (c) 2000, 2001 Fabrice Bellard |
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
* |
||||
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
||||
*/ |
||||
|
||||
#include "libavutil/cpu.h" |
||||
#include "libavutil/x86/asm.h" |
||||
#include "libavcodec/hpeldsp.h" |
||||
#include "dsputil_mmx.h" |
||||
|
||||
//#undef NDEBUG
|
||||
//#include <assert.h>
|
||||
|
||||
#if HAVE_YASM |
||||
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, |
||||
const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, |
||||
const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, |
||||
const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, |
||||
const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
#endif /* HAVE_YASM */ |
||||
|
||||
|
||||
#if HAVE_INLINE_ASM |
||||
|
||||
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

/* Fill regd with 0xfe in every byte: pcmpeqd -> all-ones, paddb doubles
 * each 0xff byte to 0xfe (with wraparound). */
#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw    $15, %%"#regd"        \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw   $15, %%"#regd"         \n\t"           \
        "psllw   $1, %%"#regd"          \n\t"::)

#endif

// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq  "#rega", "#regr"             \n\t"           \
    "pand  "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq $1, "#regb"                  \n\t"           \
    "paddb "#regb", "#regr"             \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq  "#rega", "#regr"             \n\t"           \
    "por   "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq $1, "#regb"                  \n\t"           \
    "psubb "#regb", "#regr"             \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand  %%mm6, "#regb"               \n\t"                   \
    "pand  %%mm6, "#regd"               \n\t"                   \
    "psrlq $1, "#regb"                  \n\t"                   \
    "psrlq $1, "#regd"                  \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand  %%mm6, "#regb"               \n\t"                   \
    "pand  %%mm6, "#regd"               \n\t"                   \
    "psrlq $1, "#regd"                  \n\t"                   \
    "psrlq $1, "#regb"                  \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
||||
|
||||
/***********************************/
/* MMX no rounding */
#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
||||
|
||||
#endif /* HAVE_INLINE_ASM */ |
||||
|
||||
|
||||
#if HAVE_YASM
#define ff_put_pixels8_mmx ff_put_pixels8_mmxext

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow

#include "hpeldsp_avg_template.c"

#undef DEF

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

#include "hpeldsp_avg_template.c"

#undef DEF

#endif /* HAVE_YASM */
||||
|
||||
|
||||
#if HAVE_INLINE_ASM |
||||
#define put_no_rnd_pixels16_mmx put_pixels16_mmx |
||||
#define put_no_rnd_pixels8_mmx put_pixels8_mmx |
||||
#define put_pixels16_mmxext put_pixels16_mmx |
||||
#define put_pixels8_mmxext put_pixels8_mmx |
||||
#define put_pixels4_mmxext put_pixels4_mmx |
||||
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx |
||||
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx |
||||
|
||||
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h) |
||||
{ |
||||
__asm__ volatile ( |
||||
"lea (%3, %3), %%"REG_a" \n\t" |
||||
".p2align 3 \n\t" |
||||
"1: \n\t" |
||||
"movq (%1 ), %%mm0 \n\t" |
||||
"movq (%1, %3), %%mm1 \n\t" |
||||
"movq %%mm0, (%2) \n\t" |
||||
"movq %%mm1, (%2, %3) \n\t" |
||||
"add %%"REG_a", %1 \n\t" |
||||
"add %%"REG_a", %2 \n\t" |
||||
"movq (%1 ), %%mm0 \n\t" |
||||
"movq (%1, %3), %%mm1 \n\t" |
||||
"movq %%mm0, (%2) \n\t" |
||||
"movq %%mm1, (%2, %3) \n\t" |
||||
"add %%"REG_a", %1 \n\t" |
||||
"add %%"REG_a", %2 \n\t" |
||||
"subl $4, %0 \n\t" |
||||
"jnz 1b \n\t" |
||||
: "+g"(h), "+r"(pixels), "+r"(block) |
||||
: "r"((x86_reg)line_size) |
||||
: "%"REG_a, "memory" |
||||
); |
||||
} |
||||
|
||||
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h) |
||||
{ |
||||
__asm__ volatile ( |
||||
"lea (%3, %3), %%"REG_a" \n\t" |
||||
".p2align 3 \n\t" |
||||
"1: \n\t" |
||||
"movq (%1 ), %%mm0 \n\t" |
||||
"movq 8(%1 ), %%mm4 \n\t" |
||||
"movq (%1, %3), %%mm1 \n\t" |
||||
"movq 8(%1, %3), %%mm5 \n\t" |
||||
"movq %%mm0, (%2) \n\t" |
||||
"movq %%mm4, 8(%2) \n\t" |
||||
"movq %%mm1, (%2, %3) \n\t" |
||||
"movq %%mm5, 8(%2, %3) \n\t" |
||||
"add %%"REG_a", %1 \n\t" |
||||
"add %%"REG_a", %2 \n\t" |
||||
"movq (%1 ), %%mm0 \n\t" |
||||
"movq 8(%1 ), %%mm4 \n\t" |
||||
"movq (%1, %3), %%mm1 \n\t" |
||||
"movq 8(%1, %3), %%mm5 \n\t" |
||||
"movq %%mm0, (%2) \n\t" |
||||
"movq %%mm4, 8(%2) \n\t" |
||||
"movq %%mm1, (%2, %3) \n\t" |
||||
"movq %%mm5, 8(%2, %3) \n\t" |
||||
"add %%"REG_a", %1 \n\t" |
||||
"add %%"REG_a", %2 \n\t" |
||||
"subl $4, %0 \n\t" |
||||
"jnz 1b \n\t" |
||||
: "+g"(h), "+r"(pixels), "+r"(block) |
||||
: "r"((x86_reg)line_size) |
||||
: "%"REG_a, "memory" |
||||
); |
||||
} |
||||
#endif /* HAVE_INLINE_ASM */ |
||||
|
||||
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, |
||||
ptrdiff_t line_size, int h); |
||||
|
||||
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ |
||||
do { \
|
||||
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
|
||||
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
|
||||
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
|
||||
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
|
||||
} while (0) |
||||
|
||||
/* Install the MMX (inline-asm) halfpel functions into the context. */
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_INLINE_ASM
    SET_HPEL_FUNCS(put,        [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
    SET_HPEL_FUNCS(put,        [1],  8, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
#endif /* HAVE_INLINE_ASM */
}

/* Override with MMXEXT (external asm) variants; the no-rnd table entries
 * are only installed when bit-exactness is not requested, except for the
 * VP3 "exact" variants which require it. */
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;

    c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;

    if (!(flags & CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
    }
#endif /* HAVE_YASM */

#if HAVE_MMXEXT_EXTERNAL
    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}
||||
|
||||
/* Install the 3DNow! variants; mirrors hpeldsp_init_mmxext. */
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;

    c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

    if (!(flags & CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
    }

    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_YASM */
}

/* Install SSE2 16-wide copy/avg unless the CPU flags mark SSE2 as slow. */
static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
        c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
    }
#endif /* HAVE_SSE2_EXTERNAL */
}

/* Entry point: probe CPU features once and layer the implementations
 * from generic (MMX) to most specific (SSE2). Later inits overwrite
 * the table slots filled by earlier ones. */
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX)
        hpeldsp_init_mmx(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        hpeldsp_init_mmxext(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        hpeldsp_init_3dnow(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        hpeldsp_init_sse2(c, flags, mm_flags);
}
@ -0,0 +1,428 @@ |
||||
/*
|
||||
* DSP utils mmx functions are compiled twice for rnd/no_rnd |
||||
* Copyright (c) 2000, 2001 Fabrice Bellard |
||||
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> |
||||
* |
||||
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
||||
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
||||
* and improved by Zdenek Kabelac <kabi@users.sf.net> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
// put_pixels
|
||||
/* Horizontal half-pel put, 8 wide: average each source byte with its
 * right neighbour (rounding selected by the PAVGBP instantiation). */
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);                      /* mm6 = 0xfe..fe for the avg trick */
    __asm__ volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/* Horizontal half-pel put, 16 wide: same as pixels8_x2 but processes
 * the two 8-byte halves of each row. */
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "movq   8(%1), %%mm0            \n\t"
        "movq   9(%1), %%mm1            \n\t"
        "movq   8(%1, %3), %%mm2        \n\t"
        "movq   9(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, 8(%2)            \n\t"
        "movq   %%mm5, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "movq   8(%1), %%mm0            \n\t"
        "movq   9(%1), %%mm1            \n\t"
        "movq   8(%1, %3), %%mm2        \n\t"
        "movq   9(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, 8(%2)            \n\t"
        "movq   %%mm5, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}
||||
|
||||
/* Vertical half-pel put, 8 wide: average each row with the row below,
 * keeping the previous row live in a register across iterations. */
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        "movq   (%1), %%mm0             \n\t" /* prime mm0 with row 0 */
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

/* Diagonal half-pel put, 8 wide: 2x2 box filter in 16-bit precision;
 * mm4/mm5 (then mm0/mm1) carry the previous row's horizontal sums so
 * each source row is only widened once. Two rows per loop pass. */
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
    __asm__ volatile(
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm4            \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm0, %%mm4           \n\t" /* mm4/mm5 = row0[x] + row0[x+1] */
        "paddusw %%mm1, %%mm5           \n\t"
        "xor    %%"REG_a", %%"REG_a"    \n\t"
        "add    %3, %1                  \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddusw %%mm2, %%mm0           \n\t"
        "paddusw %%mm3, %%mm1           \n\t"
        "paddusw %%mm6, %%mm4           \n\t" /* add rounding constant */
        "paddusw %%mm6, %%mm5           \n\t"
        "paddusw %%mm0, %%mm4           \n\t" /* prev row sums + this row sums */
        "paddusw %%mm1, %%mm5           \n\t"
        "psrlw  $2, %%mm4               \n\t" /* /4 */
        "psrlw  $2, %%mm5               \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "movq   %%mm4, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm2, %%mm4           \n\t"
        "paddusw %%mm3, %%mm5           \n\t"
        "paddusw %%mm6, %%mm0           \n\t"
        "paddusw %%mm6, %%mm1           \n\t"
        "paddusw %%mm4, %%mm0           \n\t"
        "paddusw %%mm5, %%mm1           \n\t"
        "psrlw  $2, %%mm0               \n\t"
        "psrlw  $2, %%mm1               \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "movq   %%mm0, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}
||||
|
||||
// avg_pixels
|
||||
#ifndef NO_RND |
||||
// in case more speed is needed - unroling would certainly help
|
||||
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
||||
{ |
||||
MOVQ_BFE(mm6); |
||||
JUMPALIGN(); |
||||
do { |
||||
__asm__ volatile( |
||||
"movq %0, %%mm0 \n\t" |
||||
"movq %1, %%mm1 \n\t" |
||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) |
||||
"movq %%mm2, %0 \n\t" |
||||
:"+m"(*block) |
||||
:"m"(*pixels) |
||||
:"memory"); |
||||
pixels += line_size; |
||||
block += line_size; |
||||
} |
||||
while (--h); |
||||
} |
||||
#endif // NO_RND
|
||||
|
||||
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
||||
{ |
||||
MOVQ_BFE(mm6); |
||||
JUMPALIGN(); |
||||
do { |
||||
__asm__ volatile( |
||||
"movq %0, %%mm0 \n\t" |
||||
"movq %1, %%mm1 \n\t" |
||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) |
||||
"movq %%mm2, %0 \n\t" |
||||
"movq 8%0, %%mm0 \n\t" |
||||
"movq 8%1, %%mm1 \n\t" |
||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) |
||||
"movq %%mm2, 8%0 \n\t" |
||||
:"+m"(*block) |
||||
:"m"(*pixels) |
||||
:"memory"); |
||||
pixels += line_size; |
||||
block += line_size; |
||||
} |
||||
while (--h); |
||||
} |
||||
|
||||
#ifndef NO_RND |
||||
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
||||
{ |
||||
MOVQ_BFE(mm6); |
||||
JUMPALIGN(); |
||||
do { |
||||
__asm__ volatile( |
||||
"movq %1, %%mm0 \n\t" |
||||
"movq 1%1, %%mm1 \n\t" |
||||
"movq %0, %%mm3 \n\t" |
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
||||
"movq %%mm0, %0 \n\t" |
||||
:"+m"(*block) |
||||
:"m"(*pixels) |
||||
:"memory"); |
||||
pixels += line_size; |
||||
block += line_size; |
||||
} while (--h); |
||||
} |
||||
#endif // NO_RND
|
||||
|
||||
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
||||
{ |
||||
MOVQ_BFE(mm6); |
||||
JUMPALIGN(); |
||||
do { |
||||
__asm__ volatile( |
||||
"movq %1, %%mm0 \n\t" |
||||
"movq 1%1, %%mm1 \n\t" |
||||
"movq %0, %%mm3 \n\t" |
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
||||
"movq %%mm0, %0 \n\t" |
||||
"movq 8%1, %%mm0 \n\t" |
||||
"movq 9%1, %%mm1 \n\t" |
||||
"movq 8%0, %%mm3 \n\t" |
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) |
||||
"movq %%mm0, 8%0 \n\t" |
||||
:"+m"(*block) |
||||
:"m"(*pixels) |
||||
:"memory"); |
||||
pixels += line_size; |
||||
block += line_size; |
||||
} while (--h); |
||||
} |
||||
|
||||
/* Vertical half-pel avg, 8 wide: interpolate between rows (PAVGBP),
 * then average into the destination; four rows per loop pass. */
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea    (%3, %3), %%"REG_a"     \n\t"
        "movq   (%1), %%mm0             \n\t" /* prime mm0 with row 0 */
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
        "movq   (%2), %%mm3             \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq   (%2, %3), %%mm3         \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"

        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
        "movq   (%2), %%mm3             \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq   (%2, %3), %%mm3         \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq   %%mm2, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"

        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

// this routine is 'slightly' suboptimal but mostly unused
/* Diagonal half-pel avg, 8 wide: 2x2 box filter as in put_pixels8_xy2,
 * followed by an average with the destination. The 0xfe constant for
 * OP_AVG is rebuilt in mm2 each time because all of mm0-mm7 are live. */
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
    __asm__ volatile(
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm4            \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm0, %%mm4           \n\t" /* row0 horizontal sums */
        "paddusw %%mm1, %%mm5           \n\t"
        "xor    %%"REG_a", %%"REG_a"    \n\t"
        "add    %3, %1                  \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq   %%mm0, %%mm1            \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddusw %%mm2, %%mm0           \n\t"
        "paddusw %%mm3, %%mm1           \n\t"
        "paddusw %%mm6, %%mm4           \n\t"
        "paddusw %%mm6, %%mm5           \n\t"
        "paddusw %%mm0, %%mm4           \n\t"
        "paddusw %%mm1, %%mm5           \n\t"
        "psrlw  $2, %%mm4               \n\t"
        "psrlw  $2, %%mm5               \n\t"
        "movq   (%2, %%"REG_a"), %%mm3  \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "pcmpeqd %%mm2, %%mm2           \n\t" /* rebuild 0xfe..fe in mm2 */
        "paddb  %%mm2, %%mm2            \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq   %%mm5, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq   %%mm2, %%mm3            \n\t"
        "movq   %%mm4, %%mm5            \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddusw %%mm2, %%mm4           \n\t"
        "paddusw %%mm3, %%mm5           \n\t"
        "paddusw %%mm6, %%mm0           \n\t"
        "paddusw %%mm6, %%mm1           \n\t"
        "paddusw %%mm4, %%mm0           \n\t"
        "paddusw %%mm5, %%mm1           \n\t"
        "psrlw  $2, %%mm0               \n\t"
        "psrlw  $2, %%mm1               \n\t"
        "movq   (%2, %%"REG_a"), %%mm3  \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "pcmpeqd %%mm2, %%mm2           \n\t"
        "paddb  %%mm2, %%mm2            \n\t"
        OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq   %%mm1, (%2, %%"REG_a")  \n\t"
        "add    %3, %%"REG_a"           \n\t"

        "subl   $2, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}
||||
|
||||
//FIXME optimize
|
||||
/* The 16-wide y2/xy2 variants simply run the 8-wide routine on each
 * half of the block. */
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_y2)(block,     pixels,     line_size, h);
    DEF(put, pixels8_y2)(block + 8, pixels + 8, line_size, h);
}

static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_xy2)(block,     pixels,     line_size, h);
    DEF(put, pixels8_xy2)(block + 8, pixels + 8, line_size, h);
}

static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_y2)(block,     pixels,     line_size, h);
    DEF(avg, pixels8_y2)(block + 8, pixels + 8, line_size, h);
}

static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_xy2)(block,     pixels,     line_size, h);
    DEF(avg, pixels8_xy2)(block + 8, pixels + 8, line_size, h);
}
Loading…
Reference in new issue