vp8: convert idct/mc x86 assembly to use cpuflags().

pull/3/merge
Ronald S. Bultje 13 years ago
parent 8249a23fc1
commit e25be47154
  1. 112
      libavcodec/x86/vp8dsp-init.c
  2. 128
      libavcodec/x86/vp8dsp.asm

@ -29,16 +29,16 @@
/*
* MC functions
*/
extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
extern void ff_put_vp8_epel4_h4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
extern void ff_put_vp8_epel4_h6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
extern void ff_put_vp8_epel4_v4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
extern void ff_put_vp8_epel4_v6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
extern void ff_put_vp8_bilinear4_h_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
extern void ff_put_vp8_bilinear4_v_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
}
#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
TAP_W8 (mmx2, epel, h4)
TAP_W8 (mmx2, epel, h6)
TAP_W16(mmx2, epel, h6)
TAP_W8 (mmx2, epel, v4)
TAP_W8 (mmx2, epel, v6)
TAP_W16(mmx2, epel, v6)
TAP_W8 (mmx2, bilinear, h)
TAP_W16(mmx2, bilinear, h)
TAP_W8 (mmx2, bilinear, v)
TAP_W16(mmx2, bilinear, v)
#endif
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)
HVTAP(mmx2, 8, x, y, 4, 8) \
HVTAP(mmx2, 8, x, y, 8, 16)
HVTAP(mmxext, 8, 6, 6, 16, 16)
HVTAP(mmx2, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
HVTAP(mmx2, 8, x, y, 4, 8)
#endif
HVTAPMMX(4, 4)
@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
HVBILIN(mmxext, 8, 4, 8)
HVBILIN(mmx2, 8, 4, 8)
#if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16)
HVBILIN(mmx2, 8, 8, 16)
HVBILIN(mmx2, 8, 16, 16)
#endif
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
ptrdiff_t stride);
@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
int e, int i, int hvt);
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(mmx2)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & AV_CPU_FLAG_MMX2) {
VP8_MC_FUNC(2, 4, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
VP8_MC_FUNC(2, 4, mmx2);
VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
#if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
VP8_LUMA_MC_FUNC(0, 16, mmx2);
VP8_MC_FUNC(1, 8, mmx2);
VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
#endif
}

@ -173,8 +173,8 @@ SECTION .text
; int height, int mx, int my);
;-----------------------------------------------------------------------------
%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6, 8
lea r5d, [r5*3]
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
movu m0, [r2-2]
mova m1, m0
mova m2, m0
%ifidn %1, 4
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
punpcklbw m0, [r2+3]
@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
jg .nextrow
REP_RET
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
cglobal put_vp8_epel%1_h4, 6, 6, 7
shl r5d, 4
mova m2, [pw_64]
mova m3, [filter_h2_shuf]
@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
jg .nextrow
REP_RET
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
cglobal put_vp8_epel%1_v4, 7, 7, 8
shl r6d, 4
%ifdef PIC
lea r11, [fourtap_filter_hb_m]
@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
jg .nextrow
REP_RET
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
cglobal put_vp8_epel%1_v6, 7, 7, 8
lea r6d, [r6*3]
%ifdef PIC
lea r11, [sixtap_filter_hb_m]
@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
REP_RET
%endmacro
INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7
INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6
INIT_MMX mmx2
cglobal put_vp8_epel4_h4, 6, 6
shl r5d, 4
%ifdef PIC
lea r11, [fourtap_filter_hw_m]
@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
REP_RET
; 4x4 block, H-only 6-tap filter
cglobal put_vp8_epel4_h6_mmxext, 6, 6
INIT_MMX mmx2
cglobal put_vp8_epel4_h6, 6, 6
lea r5d, [r5*3]
%ifdef PIC
lea r11, [sixtap_filter_hw_m]
@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
jg .nextrow
REP_RET
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6, 10
shl r5d, 5
%ifdef PIC
lea r11, [fourtap_filter_v_m]
@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
jg .nextrow
REP_RET
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6, 14
lea r5d, [r5*3]
shl r5d, 4
%ifdef PIC
@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
jg .nextrow
REP_RET
%macro FILTER_V 3
%macro FILTER_V 1
; 4- or 8-pixel-wide block (one instantiation per width), V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
cglobal put_vp8_epel%1_v4, 7, 7, 8
shl r6d, 5
%ifdef PIC
lea r11, [fourtap_filter_v_m]
@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
; 4- or 8-pixel-wide block (one instantiation per width), V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
cglobal put_vp8_epel%1_v6, 7, 7, 8
shl r6d, 4
lea r6, [r6*3]
%ifdef PIC
@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
REP_RET
%endmacro
INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2, 8, 8
INIT_MMX mmx2
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7
mov r5d, 8*16
shl r6d, 4
sub r5d, r6d
@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%ifidn %1, mmxext
%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
movh [r0+r1*0], m0
@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
jg .nextrow
REP_RET
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
cglobal put_vp8_bilinear%1_h, 7, 7, 7
mov r6d, 8*16
shl r5d, 4
sub r6d, r5d
@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%ifidn %1, mmxext
%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
movh [r0+r1*0], m0
@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
REP_RET
%endmacro
INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR sse2, 8, 7
INIT_MMX mmx2
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
cglobal put_vp8_bilinear%1_v, 7, 7, 5
shl r6d, 4
%ifdef PIC
lea r11, [bilinear_filter_vb_m]
@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
jg .nextrow
REP_RET
cglobal put_vp8_bilinear%1_h_ssse3, 7,7
cglobal put_vp8_bilinear%1_h, 7, 7, 5
shl r5d, 4
%ifdef PIC
lea r11, [bilinear_filter_vb_m]
@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
REP_RET
%endmacro
INIT_MMX
INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
cglobal put_vp8_pixels8_mmx, 5,5
INIT_MMX mmx
cglobal put_vp8_pixels8, 5,5
.nextrow:
movq mm0, [r2+r3*0]
movq mm1, [r2+r3*1]
@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5
REP_RET
%if ARCH_X86_32
cglobal put_vp8_pixels16_mmx, 5,5
INIT_MMX mmx
cglobal put_vp8_pixels16, 5,5
.nextrow:
movq mm0, [r2+r3*0+0]
movq mm1, [r2+r3*0+8]
@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5
REP_RET
%endif
cglobal put_vp8_pixels16_sse, 5,5,2
INIT_XMM sse
cglobal put_vp8_pixels16, 5,5,2
.nextrow:
movups xmm0, [r2+r3*0]
movups xmm1, [r2+r3*1]
@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2
%4 [r1+r2+%3], m5
%endmacro
INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3
; load data
movd m0, [r1]
@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
ADD_DC m0, m1, 0, movh
RET
INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6
; load data
movd m0, [r1]
pxor m1, m1
@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
;-----------------------------------------------------------------------------
%if ARCH_X86_32
INIT_MMX
cglobal vp8_idct_dc_add4y_mmx, 3, 3
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3
; load data
movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C
@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
RET
%endif
INIT_XMM
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6
; load data
movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C
@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
INIT_MMX
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3
; load data
movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C
@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
SWAP %4, %3
%endmacro
INIT_MMX
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3
; load block data
movq m0, [r1+ 0]
movq m1, [r1+ 8]
@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3
movq m3, [r1+24]
movq m6, [pw_20091]
movq m7, [pw_17734]
%ifidn %1, sse
%if cpuflag(sse)
xorps xmm0, xmm0
movaps [r1+ 0], xmm0
movaps [r1+16], xmm0
@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3
%endmacro
%if ARCH_X86_32
VP8_IDCT_ADD mmx
INIT_MMX mmx
VP8_IDCT_ADD
%endif
VP8_IDCT_ADD sse
INIT_MMX sse
VP8_IDCT_ADD
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_<opt>(DCTELEM block[4][4][16], DCTELEM dc[16])
@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse
SWAP %1, %4, %3
%endmacro
%macro VP8_DC_WHT 1
cglobal vp8_luma_dc_wht_%1, 2,3
%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3
movq m0, [r1]
movq m1, [r1+8]
movq m2, [r1+16]
movq m3, [r1+24]
%ifidn %1, sse
%if cpuflag(sse)
xorps xmm0, xmm0
movaps [r1+ 0], xmm0
movaps [r1+16], xmm0
@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
RET
%endmacro
INIT_MMX
%if ARCH_X86_32
VP8_DC_WHT mmx
INIT_MMX mmx
VP8_DC_WHT
%endif
VP8_DC_WHT sse
INIT_MMX sse
VP8_DC_WHT
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);

Loading…
Cancel
Save