diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index b4e3161db5..bbda8fadf0 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -9,7 +9,7 @@ YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o \
 YASM-OBJS-$(CONFIG_GPL)                += x86/h264_idct_sse2.o      \
 
 MMX-OBJS-$(CONFIG_H264DSP)             += x86/h264dsp_mmx.o
-YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock_sse2.o   \
+YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o        \
                                           x86/h264_weight.o        \
 
 YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index b2f389bb61..28c616651c 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -60,6 +60,7 @@
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0  ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1  ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
@@ -68,7 +69,7 @@ DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index 4a1023c8a7..2bd05cefbf 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -57,7 +57,7 @@ extern const uint64_t ff_pb_7;
 extern const uint64_t ff_pb_1F;
 extern const uint64_t ff_pb_3F;
 extern const uint64_t ff_pb_81;
-extern const uint64_t ff_pb_A1;
+extern const xmm_reg  ff_pb_A1;
 extern const xmm_reg  ff_pb_F8;
 extern const uint64_t ff_pb_FC;
 extern const xmm_reg  ff_pb_FE;
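The dsputil changes above add an all-zero ff_pb_0 and widen ff_pb_A1 from 8 to 16 bytes: the ported yasm code references these constants via cextern and, once assembled under INIT_XMM, loads them directly into SSE registers, and an aligned 16-byte load cannot be fed by an 8-byte uint64_t. A minimal standalone sketch of the layout requirement (plain C with SSE2 intrinsics, not FFmpeg's actual DECLARE_ALIGNED machinery; the union and function names are made up for illustration):

    #include <stdint.h>
    #include <emmintrin.h>

    /* 16 bytes, 16-byte aligned (forced by the __m128i member): the low
     * quadword can still serve 8-byte MMX loads, while the full vector
     * serves aligned SSE2 loads. */
    typedef union {
        uint64_t u64[2];
        __m128i  v;
    } xmm_const;

    static const xmm_const pb_A1 = {{ 0xA1A1A1A1A1A1A1A1ULL,
                                      0xA1A1A1A1A1A1A1A1ULL }};

    static __m128i load_pb_A1(void)
    {
        /* movdqa-style load: with the old 8-byte constant this would read
         * 8 bytes past the object (and possibly fault on alignment). */
        return _mm_load_si128(&pb_A1.v);
    }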
diff --git a/libavcodec/x86/h264_deblock_sse2.asm b/libavcodec/x86/h264_deblock.asm
similarity index 78%
rename from libavcodec/x86/h264_deblock_sse2.asm
rename to libavcodec/x86/h264_deblock.asm
index a9e6dea3d6..fb9cacfd11 100644
--- a/libavcodec/x86/h264_deblock_sse2.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -4,6 +4,7 @@
 ;* Copyright (C) 2005-2008 x264 project
 ;*
 ;* Authors: Loren Merritt
+;*          Jason Garrett-Glaser
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -23,12 +24,14 @@
 ;******************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA
-pb_00: times 16 db 0x00
-pb_01: times 16 db 0x01
-pb_03: times 16 db 0x03
-pb_a1: times 16 db 0xa1
+
+cextern pb_0
+cextern pb_1
+cextern pb_3
+cextern pb_A1
 
 SECTION .text
 
@@ -104,7 +107,7 @@
     movd    %8, m5
 %endmacro
 
-%macro SBUTTERFLY 4
+%macro SBUTTERFLY3 4
     movq    %4, %2
     punpckl%1 %2, %3
     punpckh%1 %4, %3
@@ -120,19 +123,19 @@
     movq    m4, %5
     movq    m5, %6
    movq    m6, %7
-    SBUTTERFLY bw, m0, m1, m7
-    SBUTTERFLY bw, m2, m3, m1
-    SBUTTERFLY bw, m4, m5, m3
+    SBUTTERFLY3 bw, m0, m1, m7
+    SBUTTERFLY3 bw, m2, m3, m1
+    SBUTTERFLY3 bw, m4, m5, m3
     movq    [%9+0x10], m1
-    SBUTTERFLY bw, m6, %8, m5
-    SBUTTERFLY wd, m0, m2, m1
-    SBUTTERFLY wd, m4, m6, m2
+    SBUTTERFLY3 bw, m6, %8, m5
+    SBUTTERFLY3 wd, m0, m2, m1
+    SBUTTERFLY3 wd, m4, m6, m2
     punpckhdq m0, m4
     movq    [%9+0x00], m0
-    SBUTTERFLY wd, m7, [%9+0x10], m6
-    SBUTTERFLY wd, m3, m5, m4
-    SBUTTERFLY dq, m7, m3, m0
-    SBUTTERFLY dq, m1, m2, m5
+    SBUTTERFLY3 wd, m7, [%9+0x10], m6
+    SBUTTERFLY3 wd, m3, m5, m4
+    SBUTTERFLY3 dq, m7, m3, m0
+    SBUTTERFLY3 dq, m1, m2, m5
     punpckldq m6, m4
     movq    [%9+0x10], m1
     movq    [%9+0x20], m5
@@ -151,25 +154,25 @@
     movq    m4, %5
     movq    m5, %6
     movq    m6, %7
-    SBUTTERFLY bw, m0, m1, m7
-    SBUTTERFLY bw, m2, m3, m1
-    SBUTTERFLY bw, m4, m5, m3
-    SBUTTERFLY bw, m6, %8, m5
+    SBUTTERFLY3 bw, m0, m1, m7
+    SBUTTERFLY3 bw, m2, m3, m1
+    SBUTTERFLY3 bw, m4, m5, m3
+    SBUTTERFLY3 bw, m6, %8, m5
     movq    %9,  m3
-    SBUTTERFLY wd, m0, m2, m3
-    SBUTTERFLY wd, m4, m6, m2
-    SBUTTERFLY wd, m7, m1, m6
+    SBUTTERFLY3 wd, m0, m2, m3
+    SBUTTERFLY3 wd, m4, m6, m2
+    SBUTTERFLY3 wd, m7, m1, m6
     movq    %11, m2
     movq    m2,  %9
-    SBUTTERFLY wd, m2, m5, m1
-    SBUTTERFLY dq, m0, m4, m5
-    SBUTTERFLY dq, m7, m2, m4
+    SBUTTERFLY3 wd, m2, m5, m1
+    SBUTTERFLY3 dq, m0, m4, m5
+    SBUTTERFLY3 dq, m7, m2, m4
     movq    %9,  m0
     movq    %10, m5
     movq    %13, m7
     movq    %14, m4
-    SBUTTERFLY dq, m3, %11, m0
-    SBUTTERFLY dq, m6, m1, m5
+    SBUTTERFLY3 dq, m3, %11, m0
+    SBUTTERFLY3 dq, m6, m1, m5
     movq    %11, m3
     movq    %12, m0
     movq    %15, m6
@@ -235,19 +238,19 @@
 ; clobbers: m0,3-6
 %macro DEBLOCK_P0_Q0 0
     mova    m5, m1
-    pxor    m5, m2           ; p0^q0
-    pand    m5, [pb_01]      ; (p0^q0)&1
+    pxor    m5, m2       ; p0^q0
+    pand    m5, [pb_1]   ; (p0^q0)&1
     pcmpeqb m4, m4
     pxor    m3, m4
-    pavgb   m3, m0           ; (p1 - q1 + 256)>>1
-    pavgb   m3, [pb_03]      ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
+    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
     pxor    m4, m1
-    pavgb   m4, m2           ; (q0 - p0 + 256)>>1
+    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
     pavgb   m3, m5
-    paddusb m3, m4           ; d+128+33
-    mova    m6, [pb_a1]
+    paddusb m3, m4       ; d+128+33
+    mova    m6, [pb_A1]
     psubusb m6, m3
-    psubusb m3, [pb_a1]
+    psubusb m3, [pb_A1]
     pminub  m6, m7
     pminub  m3, m7
     psubusb m1, m6
@@ -263,10 +266,10 @@
 %macro LUMA_Q1 6
     mova    %6, m1
     pavgb   %6, m2
-    pavgb   %2, %6           ; avg(p2,avg(p0,q0))
+    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
     pxor    %6, %3
-    pand    %6, [pb_01]      ; (p2^avg(p0,q0))&1
-    psubusb %2, %6           ; (p2+((p0+q0+1)>>1))>>1
+    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
+    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
     mova    %6, %1
     psubusb %6, %5
     paddusb %5, %1
@@ -495,6 +498,8 @@ cglobal x264_deblock_h_luma_%1, 0,5
     RET
 %endmacro ; DEBLOCK_LUMA
 
+INIT_MMX
+DEBLOCK_LUMA mmxext, v8, 8
 INIT_XMM
 DEBLOCK_LUMA sse2, v, 16
 
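The arithmetic core that both DEBLOCK_LUMA instantiations share is DEBLOCK_P0_Q0, which evaluates the H.264 normal-filter p0/q0 update entirely with saturating byte operations; the "d+128+33" comment tracks the bias that lets the clip be expressed as psubusb/paddusb against pb_A1. For reference, a scalar sketch of the per-pixel update the macro vectorizes (standard H.264 normal-filter equation; clip3 is spelled out so the snippet stands alone, and this is not FFmpeg code):

    #include <stdint.h>

    static int clip3(int x, int lo, int hi)
    {
        return x < lo ? lo : x > hi ? hi : x;
    }

    /* Per-pixel form of the normal-filter p0/q0 update, with the delta
     * clipped to [-tc, tc] as derived from the tc0 table. */
    static void filter_p0_q0(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
    {
        int d = clip3((((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
        *p0 = (uint8_t)clip3(*p0 + d, 0, 255);
        *q0 = (uint8_t)clip3(*q0 - d, 0, 255);
    }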
@@ -517,9 +522,9 @@ DEBLOCK_LUMA sse2, v, 16
     mova    t3, t2
     mova    t4, t2
     psrlw   t2, 1
-    pavgb   t2, mpb_00
+    pavgb   t2, mpb_0
     pxor    t2, t0
-    pand    t2, mpb_01
+    pand    t2, mpb_1
     psubb   t0, t2
 ; p1' = (p2+p1+p0+q0+2)/4;
     mova    t1, p2
@@ -528,21 +533,21 @@
     psubb   t2, q1
     paddb   t3, t3
     psubb   t3, t2               ; p2+2*p1+2*p0+2*q0+q1
-    pand    t2, mpb_01
+    pand    t2, mpb_1
     psubb   t1, t2
     pavgb   t1, p1
     pavgb   t1, t5               ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
     psrlw   t3, 2
-    pavgb   t3, mpb_00
+    pavgb   t3, mpb_0
     pxor    t3, t1
-    pand    t3, mpb_01
+    pand    t3, mpb_1
     psubb   t1, t3               ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
 
     mova    t3, p0
     mova    t2, p0
     pxor    t3, q1
     pavgb   t2, q1
-    pand    t3, mpb_01
+    pand    t3, mpb_1
     psubb   t2, t3
     pavgb   t2, p1               ; p0'b = (2*p1+p0+q0+2)/4
 
@@ -562,9 +567,9 @@
     paddb   t2, t2
     paddb   t2, t4               ; 2*p3+3*p2+p1+p0+q0
     psrlw   t2, 2
-    pavgb   t2, mpb_00
+    pavgb   t2, mpb_0
     pxor    t2, t1
-    pand    t2, mpb_01
+    pand    t2, mpb_1
     psubb   t1, t2               ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
 
     pxor    t0, p1
@@ -603,8 +608,8 @@
     %define mask0 m12
     %define mask1p m13
     %define mask1q [rsp-24]
-    %define mpb_00 m14
-    %define mpb_01 m15
+    %define mpb_0 m14
+    %define mpb_1 m15
 %else
     %define spill(x) [esp+16*x+((stack_offset+4)&15)]
     %define p2 [r4+r1]
@@ -614,8 +619,8 @@
     %define mask0 spill(2)
     %define mask1p spill(3)
     %define mask1q spill(4)
-    %define mpb_00 [pb_00]
-    %define mpb_01 [pb_01]
+    %define mpb_0 [pb_0]
+    %define mpb_1 [pb_1]
 %endif
 
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
@@ -638,12 +643,12 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
     mova    q0, [r0]
     mova    q1, [r0+r1]
 %ifdef ARCH_X86_64
-    pxor    mpb_00, mpb_00
-    mova    mpb_01, [pb_01]
+    pxor    mpb_0, mpb_0
+    mova    mpb_1, [pb_1]
     LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
     SWAP    7, 12 ; m12=mask0
-    pavgb   t5, mpb_00
-    pavgb   t5, mpb_01 ; alpha/4+1
+    pavgb   t5, mpb_0
+    pavgb   t5, mpb_1  ; alpha/4+1
     movdqa  p2, [r4+r1]
     movdqa  q2, [r0+2*r1]
     DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
@@ -658,8 +663,8 @@
     LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
     mova    m4, t5
     mova    mask0, m7
-    pavgb   m4, [pb_00]
-    pavgb   m4, [pb_01] ; alpha/4+1
+    pavgb   m4, [pb_0]
+    pavgb   m4, [pb_1]  ; alpha/4+1
     DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
     pand    m6, mask0
     DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
@@ -759,3 +764,126 @@ DEBLOCK_LUMA_INTRA sse2, v
 INIT_MMX
 DEBLOCK_LUMA_INTRA mmxext, v8
 %endif
+
+
+
+INIT_MMX
+
+%macro CHROMA_V_START 0
+    dec     r2d      ; alpha-1
+    dec     r3d      ; beta-1
+    mov     t5, r0
+    sub     t5, r1
+    sub     t5, r1
+%endmacro
+
+%macro CHROMA_H_START 0
+    dec     r2d
+    dec     r3d
+    sub     r0, 2
+    lea     t6, [r1*3]
+    mov     t5, r0
+    add     r0, t6
+%endmacro
+
+%define t5 r5
+%define t6 r6
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_v_chroma_mmxext, 5,6
+    CHROMA_V_START
+    movq    m0, [t5]
+    movq    m1, [t5+r1]
+    movq    m2, [r0]
+    movq    m3, [r0+r1]
+    call    x264_chroma_inter_body_mmxext
+    movq    [t5+r1], m1
+    movq    [r0], m2
+    RET
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_h_chroma_mmxext, 5,7
+%ifdef ARCH_X86_64
+    %define buf0 [rsp-24]
+    %define buf1 [rsp-16]
+%else
+    %define buf0 r0m
+    %define buf1 r2m
+%endif
+    CHROMA_H_START
+    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
+    movq    buf0, m0
+    movq    buf1, m3
+
+    call    x264_chroma_inter_body_mmxext
+    movq    m0, buf0
+    movq    m3, buf1
+    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+    RET
+
+ALIGN 16
+x264_chroma_inter_body_mmxext:
+    LOAD_MASK r2d, r3d
+    movd    m6, [r4] ; tc0
+    punpcklbw m6, m6
+    pand    m7, m6
+    DEBLOCK_P0_Q0
+    ret
+
+
+
+; in: %1=p0 %2=p1 %3=q1
+; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
+%macro CHROMA_INTRA_P0 3
+    movq    m4, %1
+    pxor    m4, %3
+    pand    m4, [pb_1]   ; m4 = (p0^q1)&1
+    pavgb   %1, %3
+    psubusb %1, m4
+    pavgb   %1, %2       ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
+%endmacro
+
+%define t5 r4
+%define t6 r5
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+    CHROMA_V_START
+    movq    m0, [t5]
+    movq    m1, [t5+r1]
+    movq    m2, [r0]
+    movq    m3, [r0+r1]
+    call    x264_chroma_intra_body_mmxext
+    movq    [t5+r1], m1
+    movq    [r0], m2
+    RET
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+    CHROMA_H_START
+    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
+    call    x264_chroma_intra_body_mmxext
+    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+    RET
+
+ALIGN 16
+x264_chroma_intra_body_mmxext:
+    LOAD_MASK r2d, r3d
+    movq    m5, m1
+    movq    m6, m2
+    CHROMA_INTRA_P0 m1, m0, m3
+    CHROMA_INTRA_P0 m2, m3, m0
+    psubb   m1, m5
+    psubb   m2, m6
+    pand    m1, m7
+    pand    m2, m7
+    paddb   m1, m5
+    paddb   m2, m6
+    ret
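The chroma-intra body above leans on the identity noted in CHROMA_INTRA_P0's comments: with pavgb's round-up average avg(a,b) = (a+b+1)>>1, subtracting the parity bit (p0^q1)&1 before the second average cancels the first rounding, so the result equals the spec value (p0 + q1 + 2*p1 + 2) >> 2 exactly. A standalone brute-force check of that identity (plain C, not part of the patch):

    #include <assert.h>

    /* pavgb semantics: unsigned byte average, rounding up */
    static int avg(int a, int b) { return (a + b + 1) >> 1; }

    int main(void)
    {
        /* Exhaustively verify the CHROMA_INTRA_P0 identity over all
         * byte triples: avg(p0,q1) - ((p0^q1)&1) is the exact floor
         * average, so the nested pavgb chain hits the spec value. */
        for (int p0 = 0; p0 < 256; p0++)
            for (int p1 = 0; p1 < 256; p1++)
                for (int q1 = 0; q1 < 256; q1++)
                    assert(avg(p1, avg(p0, q1) - ((p0 ^ q1) & 1)) ==
                           (p0 + q1 + 2 * p1 + 2) >> 2);
        return 0;
    }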
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 65f89c6b9a..9bdde84b15 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -549,251 +549,6 @@ static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTE
 /***********************************/
 /* deblocking */
 
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT_MMX(x,y,a,o,t)\
-    "movq    "#y", "#t"  \n\t"\
-    "movq    "#x", "#o"  \n\t"\
-    "psubusb "#x", "#t"  \n\t"\
-    "psubusb "#y", "#o"  \n\t"\
-    "por     "#t", "#o"  \n\t"\
-    "psubusb "#a", "#o"  \n\t"
-
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT2_MMX(x,y,a,o,t)\
-    "movq    "#y", "#t"  \n\t"\
-    "movq    "#x", "#o"  \n\t"\
-    "psubusb "#x", "#t"  \n\t"\
-    "psubusb "#y", "#o"  \n\t"\
-    "psubusb "#a", "#t"  \n\t"\
-    "psubusb "#a", "#o"  \n\t"\
-    "pcmpeqb "#t", "#o"  \n\t"\
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
-// out: mm5=beta-1, mm7=mask
-// clobbers: mm4,mm6
-#define H264_DEBLOCK_MASK(alpha1, beta1) \
-    "pshufw $0, "#alpha1", %%mm4 \n\t"\
-    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
-    "packuswb %%mm4, %%mm4 \n\t"\
-    "packuswb %%mm5, %%mm5 \n\t"\
-    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
-    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
-    "por %%mm4, %%mm7 \n\t"\
-    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
-    "por %%mm4, %%mm7 \n\t"\
-    "pxor %%mm6, %%mm6 \n\t"\
-    "pcmpeqb %%mm6, %%mm7 \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
-// out: mm1=p0' mm2=q0'
-// clobbers: mm0,3-6
-#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
-    "movq    %%mm1 , %%mm5 \n\t"\
-    "pxor    %%mm2 , %%mm5 \n\t" /* p0^q0*/\
-    "pand    "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\
-    "pcmpeqb %%mm4 , %%mm4 \n\t"\
-    "pxor    %%mm4 , %%mm3 \n\t"\
-    "pavgb   %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
-    "pavgb   "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
-    "pxor    %%mm1 , %%mm4 \n\t"\
-    "pavgb   %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
-    "pavgb   %%mm5 , %%mm3 \n\t"\
-    "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\
-    "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
-    "psubusb %%mm3 , %%mm6 \n\t"\
-    "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
-    "pminub  %%mm7 , %%mm6 \n\t"\
-    "pminub  %%mm7 , %%mm3 \n\t"\
-    "psubusb %%mm6 , %%mm1 \n\t"\
-    "psubusb %%mm3 , %%mm2 \n\t"\
-    "paddusb %%mm3 , %%mm1 \n\t"\
-    "paddusb %%mm6 , %%mm2 \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
-// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-// clobbers: q2, tmp, tc0
-#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
-    "movq    %%mm1,  "#tmp"   \n\t"\
-    "pavgb   %%mm2,  "#tmp"   \n\t"\
-    "pavgb   "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
-    "pxor    "q2addr", "#tmp" \n\t"\
-    "pand    %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
-    "psubusb "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
-    "movq    "#p1",  "#tmp"   \n\t"\
-    "psubusb "#tc0", "#tmp"   \n\t"\
-    "paddusb "#p1",  "#tc0"   \n\t"\
-    "pmaxub  "#tmp", "#q2"    \n\t"\
-    "pminub  "#tc0", "#q2"    \n\t"\
-    "movq    "#q2",  "q1addr" \n\t"
-
-static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
-    DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
-
-    __asm__ volatile(
-        "movq    (%2,%4), %%mm0    \n\t" //p1
-        "movq    (%2,%4,2), %%mm1  \n\t" //p0
-        "movq    (%3),    %%mm2    \n\t" //q0
-        "movq    (%3,%4), %%mm3    \n\t" //q1
-        H264_DEBLOCK_MASK(%7, %8)
-
-        "movd      %6,    %%mm4    \n\t"
-        "punpcklbw %%mm4, %%mm4    \n\t"
-        "punpcklwd %%mm4, %%mm4    \n\t"
-        "pcmpeqb   %%mm3, %%mm3    \n\t"
-        "movq      %%mm4, %%mm6    \n\t"
-        "pcmpgtb   %%mm3, %%mm4    \n\t"
-        "movq      %%mm6, %1       \n\t"
-        "pand      %%mm4, %%mm7    \n\t"
-        "movq      %%mm7, %0       \n\t"
-
-        /* filter p1 */
-        "movq    (%2),    %%mm3    \n\t" //p2
-        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
-        "pand    %%mm7,   %%mm6    \n\t" // mask & |p2-p0|<beta-1
-        "pand    %0,      %%mm6    \n\t"
-        "movq    %1,      %%mm5    \n\t" // can be merged with the and below but is slower then
-        "pand    %%mm6,   %%mm5    \n\t"
-        "psubb   %%mm6,   %%mm7    \n\t"
-        "movq    (%3,%4), %%mm3    \n\t"
-        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
-
-        /* filter p0, q0 */
-        H264_DEBLOCK_P0_Q0(%9, unused)
-        "movq    %%mm1, (%2,%4,2)  \n\t"
-        "movq    %%mm2, (%3)       \n\t"
-
-        : "=m"(tmp0[0]), "=m"(tmp0[1])
-        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
-          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
-          "m"(ff_bone)
-    );
-}
-
-static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    if((tc0[0] & tc0[1]) >= 0)
-        h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
-    if((tc0[2] & tc0[3]) >= 0)
-        h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
-}
-static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    //FIXME: could cut some load/stores by merging transpose with filter
-    // also, it only needs to transpose 6x8
-    DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
-    int i;
-    for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
-        if((tc0[0] & tc0[1]) < 0)
-            continue;
-        transpose4x4(trans,       pix-4,          8, stride);
-        transpose4x4(trans  +4*8, pix,            8, stride);
-        transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
-        transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
-        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
-        transpose4x4(pix-2,          trans  +2*8, stride, 8);
-        transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
-    }
-}
-
-static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
-    __asm__ volatile(
-        "movq    (%0),    %%mm0  \n\t" //p1
-        "movq    (%0,%2), %%mm1  \n\t" //p0
-        "movq    (%1),    %%mm2  \n\t" //q0
-        "movq    (%1,%2), %%mm3  \n\t" //q1
-        H264_DEBLOCK_MASK(%4, %5)
-        "movd      %3,    %%mm6  \n\t"
-        "punpcklbw %%mm6, %%mm6  \n\t"
-        "pand      %%mm6, %%mm7  \n\t" // mm7 = tc&mask
-        H264_DEBLOCK_P0_Q0(%6, %7)
-        "movq    %%mm1, (%0,%2)  \n\t"
-        "movq    %%mm2, (%1)     \n\t"
-
-        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
-           "r"(*(uint32_t*)tc0),
-           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
-    );
-}
-
-static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
-}
-
-static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
-    //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
-    transpose4x4(trans, pix-2, 8, stride);
-    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
-    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
-    transpose4x4(pix-2, trans, stride, 8);
-    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
-// p0 = (p0 + q1 + 2*p1 + 2) >> 2
-#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
-    "movq    "#p0", %%mm4  \n\t"\
-    "pxor    "#q1", %%mm4  \n\t"\
-    "pand    "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
-    "pavgb   "#q1", "#p0"  \n\t"\
-    "psubusb %%mm4, "#p0"  \n\t"\
-    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
-
-static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
-{
-    __asm__ volatile(
-        "movq    (%0),    %%mm0  \n\t"
-        "movq    (%0,%2), %%mm1  \n\t"
-        "movq    (%1),    %%mm2  \n\t"
-        "movq    (%1,%2), %%mm3  \n\t"
-        H264_DEBLOCK_MASK(%3, %4)
-        "movq    %%mm1,   %%mm5  \n\t"
-        "movq    %%mm2,   %%mm6  \n\t"
-        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
-        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
-        "psubb   %%mm5,   %%mm1  \n\t"
-        "psubb   %%mm6,   %%mm2  \n\t"
-        "pand    %%mm7,   %%mm1  \n\t"
-        "pand    %%mm7,   %%mm2  \n\t"
-        "paddb   %%mm5,   %%mm1  \n\t"
-        "paddb   %%mm6,   %%mm2  \n\t"
-        "movq    %%mm1, (%0,%2)  \n\t"
-        "movq    %%mm2, (%1)     \n\t"
-        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
-           "m"(alpha1), "m"(beta1), "m"(ff_bone)
-    );
-}
-
-static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
-    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
-}
-
-static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
-    //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
-    transpose4x4(trans, pix-2, 8, stride);
-    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
-    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
-    transpose4x4(pix-2, trans, stride, 8);
-    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
     int dir;
@@ -918,6 +673,42 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
     );
 }
 
+#define LF_FUNC(DIR, TYPE, OPT) \
+void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+                                                       int alpha, int beta, int8_t *tc0);
+#define LF_IFUNC(DIR, TYPE, OPT) \
+void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+                                                       int alpha, int beta);
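LF_FUNC and LF_IFUNC generate the prototypes for the yasm symbols by token pasting; LF_FUNC(h, chroma, mmxext), for example, expands to:

    void ff_x264_deblock_h_chroma_mmxext(uint8_t *pix, int stride,
                                         int alpha, int beta, int8_t *tc0);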
+
+LF_FUNC (h,  chroma,       mmxext)
+LF_IFUNC(h,  chroma_intra, mmxext)
+LF_FUNC (v,  chroma,       mmxext)
+LF_IFUNC(v,  chroma_intra, mmxext)
+
+LF_FUNC (h,  luma,         mmxext)
+LF_IFUNC(h,  luma_intra,   mmxext)
+#if HAVE_YASM && ARCH_X86_32
+LF_FUNC (v8, luma,         mmxext)
+static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+    if((tc0[0] & tc0[1]) >= 0)
+        ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+    if((tc0[2] & tc0[3]) >= 0)
+        ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
+}
+LF_IFUNC(v8, luma_intra,   mmxext)
+static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+{
+    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
+    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+}
+#endif
+
+LF_FUNC (h,  luma,         sse2)
+LF_IFUNC(h,  luma_intra,   sse2)
+LF_FUNC (v,  luma,         sse2)
+LF_IFUNC(v,  luma_intra,   sse2)
+
 /***********************************/
 /* weighted prediction */
 
@@ -949,21 +740,6 @@ H264_BIWEIGHT_MMX ( 4,  8)
 H264_BIWEIGHT_MMX ( 4,  4)
 H264_BIWEIGHT_MMX ( 4,  2)
 
-void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
-void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
-void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
-void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-
-#if HAVE_YASM && ARCH_X86_32
-void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
-static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
-{
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
-}
-#endif
-
 void ff_h264dsp_init_x86(H264DSPContext *c)
 {
     int mm_flags = mm_support();
@@ -987,12 +763,6 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
         c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
         c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
 
-        c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
-        c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
-        c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
-        c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
-        c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
-        c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
         c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
     }
     if(mm_flags & FF_MM_SSE2){
@@ -1002,7 +772,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
 
 #if HAVE_YASM
     if (mm_flags & FF_MM_MMX2){
+        c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
+        c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
+        c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
+        c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
 #if ARCH_X86_32
+        c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
+        c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
         c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
         c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
 #endif
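A note on the v8 wrappers registered above: MMX registers hold only 8 bytes, so the 16-pixel luma edge is filtered as two independent 8-pixel halves, and the guard (tc0[0] & tc0[1]) >= 0 skips a half only when both of its 4-pixel segments carry tc0 == -1 (the "no filtering" marker), since the AND of two signed bytes has its sign bit set only when both are negative. A small standalone sketch of just that predicate (hypothetical helper name, not patch code):

    #include <stdint.h>
    #include <stdio.h>

    /* Filter the 8-pixel half unless BOTH 4-pixel segments are skipped. */
    static int half_is_active(const int8_t *tc0)
    {
        return (tc0[0] & tc0[1]) >= 0; /* sign bit set only if both negative */
    }

    int main(void)
    {
        const int8_t cases[3][2] = { {0, 3}, {-1, 2}, {-1, -1} };
        for (int i = 0; i < 3; i++)
            printf("tc0={%d,%d} -> %s\n", cases[i][0], cases[i][1],
                   half_is_active(cases[i]) ? "filter" : "skip");
        return 0;
    }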