VP8: 30% faster idct_mb

Take shortcuts based on statistically common situations.
Add a 4-at-a-time idct_dc function (MMX and SSE2), since rows of 4 DC-only DCT
blocks are common.
TODO: tie this more directly into the MB mode, since the DC-level transform is
only used for non-splitmv blocks?

Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
Jason Garrett-Glaser, 15 years ago
Branch: oldabi · parent: ef38842f0b · commit: 8a467b2d44
Changed files (lines changed):
  libavcodec/vp8.c              64
  libavcodec/vp8dsp.c           26
  libavcodec/vp8dsp.h            1
  libavcodec/x86/vp8dsp-init.c   5
  libavcodec/x86/vp8dsp.asm    181
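
What the fast path exploits: the decoder stores the nonzero-coefficient
counts for a row of four 4x4 luma blocks in four adjacent bytes of
non_zero_count_cache, so a single aligned 32-bit load classifies the whole
row at once (see the first vp8.c hunk below). A minimal C sketch of that
test, not part of the commit (classify_row and the enum are hypothetical
names; memcpy stands in for FFmpeg's AV_RN32A macro):

#include <stdint.h>
#include <string.h>

enum row_class { ROW_EMPTY, ROW_DC_ONLY, ROW_FULL };

static enum row_class classify_row(const uint8_t nnz_row[4])
{
    uint32_t nnz4;
    memcpy(&nnz4, nnz_row, 4);   /* one 32-bit load instead of 4 byte tests */
    if (!nnz4)
        return ROW_EMPTY;        /* no coefficients anywhere: skip the row  */
    if (nnz4 & ~0x01010101u)
        return ROW_FULL;         /* some block also has AC coefficients     */
    return ROW_DC_ONLY;          /* every count is 0 or 1: DC-only row      */
}

ROW_DC_ONLY is the "statistically common situation" from the commit message:
each block needs at most a DC add, so one vp8_idct_dc_add4 call replaces up
to four separate vp8_idct_dc_add calls.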

--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1186,45 +1186,49 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
         }
     }
 }
 
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
-                    VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
 {
-    int x, y, nnz;
+    int x, y, ch, nnz;
 
-    if (mb->mode != MODE_I4x4)
+    if (mb->mode != MODE_I4x4) {
+        uint8_t *y_dst = dst[0];
         for (y = 0; y < 4; y++) {
-            for (x = 0; x < 4; x++) {
-                nnz = s->non_zero_count_cache[y][x];
-                if (nnz) {
-                    if (nnz == 1)
-                        s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
-                    else
-                        s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+            uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
+            if (nnz) {
+                if (nnz&~0x01010101) {
+                    for (x = 0; x < 4; x++) {
+                        nnz = s->non_zero_count_cache[y][x];
+                        if (nnz) {
+                            if (nnz == 1)
+                                s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+                            else
+                                s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+                        }
+                    }
+                } else {
+                    s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
                 }
             }
             y_dst += 4*s->linesize;
         }
+    }
 
-    for (y = 0; y < 2; y++) {
-        for (x = 0; x < 2; x++) {
-            nnz = s->non_zero_count_cache[4][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-            }
-
-            nnz = s->non_zero_count_cache[5][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
-            }
-        }
-        u_dst += 4*s->uvlinesize;
-        v_dst += 4*s->uvlinesize;
+    for (ch = 0; ch < 2; ch++) {
+        if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
+            uint8_t *ch_dst = dst[1+ch];
+            for (y = 0; y < 2; y++) {
+                for (x = 0; x < 2; x++) {
+                    nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+                    if (nnz) {
+                        if (nnz == 1)
+                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                        else
+                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                    }
+                }
+                ch_dst += 4*s->uvlinesize;
+            }
+        }
     }
 }
@@ -1511,7 +1515,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
 
             if (!mb->skip) {
-                idct_mb(s, dst[0], dst[1], dst[2], mb);
+                idct_mb(s, dst, mb);
             } else {
                 AV_ZERO64(s->left_nnz);
                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned

--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -109,6 +109,25 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
     }
 }
 
+static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+    int i, j;
+    for (j = 0; j < 4; j++) {
+        uint8_t *pix = dst+j*4;
+        int dc = (block[j][0] + 4) >> 3;
+        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
+        block[j][0] = 0;
+        if (!dc)
+            continue;
+        for (i = 0; i < 4; i++) {
+            pix[0] = cm[pix[0]];
+            pix[1] = cm[pix[1]];
+            pix[2] = cm[pix[2]];
+            pix[3] = cm[pix[3]];
+            pix += stride;
+        }
+    }
+}
+
 // because I like only having two parameters to pass functions...
 #define LOAD_PIXELS\
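
For reference, a self-contained model of the new C fallback above: the same
arithmetic with an explicit clamp in place of the ff_cropTbl lookup, which
folds the clamp into a single table read per pixel. This is an illustrative
sketch, not FFmpeg code (idct_dc_add4_ref and clip_u8 are hypothetical
names):

#include <stdint.h>

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

static void idct_dc_add4_ref(uint8_t *dst, int16_t block[4][16], int stride)
{
    for (int j = 0; j < 4; j++) {
        uint8_t *pix = dst + 4*j;        /* j-th 4x4 block in the row     */
        int dc = (block[j][0] + 4) >> 3; /* rounded shift: the DC-only IDCT */
        block[j][0] = 0;                 /* clear the coefficient after use */
        if (!dc)
            continue;                    /* nothing to add to the pixels  */
        for (int i = 0; i < 4; i++) {
            for (int k = 0; k < 4; k++)
                pix[k] = clip_u8(pix[k] + dc);
            pix += stride;
        }
    }
}
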
@@ -460,9 +479,10 @@ VP8_BILINEAR(4)
 
 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
 {
     dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
     dsp->vp8_idct_add = vp8_idct_add_c;
     dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+    dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
 
     dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
     dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;

--- a/libavcodec/vp8dsp.h
+++ b/libavcodec/vp8dsp.h
@@ -33,6 +33,7 @@ typedef struct VP8DSPContext {
     void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]);
     void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride);
     void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride);
+    void (*vp8_idct_dc_add4)(uint8_t *dst, DCTELEM block[4][16], int stride);
 
     // loop filter applied to edges between macroblocks
     void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride,

--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -220,6 +220,8 @@ HVBILIN(ssse3, 8, 16, 16)
 
 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
+extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
@@ -283,6 +285,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 #if HAVE_YASM
     if (mm_flags & FF_MM_MMX) {
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+        c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx;
         c->vp8_idct_add = ff_vp8_idct_add_mmx;
         c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
         c->put_vp8_epel_pixels_tab[0][0][0] =
@@ -351,6 +354,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
     }
 
     if (mm_flags & FF_MM_SSE2) {
+        c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2;
+
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
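
The three init hunks rely on ordering: ff_vp8dsp_init installs the C
fallback, the MMX pointer overwrites it when FF_MM_MMX is set, and the SSE2
branch runs later still, so the fastest supported version wins. Condensed to
the new pointer only, the pattern looks like this (a sketch, assuming the
declarations above are in scope; init_idct_dc_add4 is a hypothetical name):

static void init_idct_dc_add4(VP8DSPContext *c, int mm_flags)
{
    c->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;           /* portable baseline */
    if (mm_flags & FF_MM_MMX)
        c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx;  /* overwrite...      */
    if (mm_flags & FF_MM_SSE2)
        c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2; /* ...last one wins  */
}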

--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -900,75 +900,148 @@ cglobal put_vp8_pixels16_sse, 5,5,2
     REP_RET
 
 ;-----------------------------------------------------------------------------
+; IDCT functions:
+;
 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
 ;-----------------------------------------------------------------------------
 
+%macro ADD_DC 4
+    %4        m2, [r0+%3]
+    %4        m3, [r0+r2+%3]
+    %4        m4, [r1+%3]
+    %4        m5, [r1+r2+%3]
+    paddusb   m2, %1
+    paddusb   m3, %1
+    paddusb   m4, %1
+    paddusb   m5, %1
+    psubusb   m2, %2
+    psubusb   m3, %2
+    psubusb   m4, %2
+    psubusb   m5, %2
+    %4 [r0+%3], m2
+    %4 [r0+r2+%3], m3
+    %4 [r1+%3], m4
+    %4 [r1+r2+%3], m5
+%endmacro
+
+INIT_MMX
 cglobal vp8_idct_dc_add_mmx, 3, 3
     ; load data
-    movd       mm0, [r1]
+    movd       m0, [r1]
 
     ; calculate DC
-    paddw      mm0, [pw_4]
-    pxor       mm1, mm1
-    psraw      mm0, 3
-    movd      [r1], mm1
-    psubw      mm1, mm0
-    packuswb   mm0, mm0
-    packuswb   mm1, mm1
-    punpcklbw  mm0, mm0
-    punpcklbw  mm1, mm1
-    punpcklwd  mm0, mm0
-    punpcklwd  mm1, mm1
+    paddw      m0, [pw_4]
+    pxor       m1, m1
+    psraw      m0, 3
+    movd      [r1], m1
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+    punpcklwd  m0, m0
+    punpcklwd  m1, m1
 
     ; add DC
     lea        r1, [r0+r2*2]
-    movd       mm2, [r0]
-    movd       mm3, [r0+r2]
-    movd       mm4, [r1]
-    movd       mm5, [r1+r2]
-    paddusb    mm2, mm0
-    paddusb    mm3, mm0
-    paddusb    mm4, mm0
-    paddusb    mm5, mm0
-    psubusb    mm2, mm1
-    psubusb    mm3, mm1
-    psubusb    mm4, mm1
-    psubusb    mm5, mm1
-    movd      [r0], mm2
-    movd   [r0+r2], mm3
-    movd      [r1], mm4
-    movd   [r1+r2], mm5
+    ADD_DC     m0, m1, 0, movh
     RET
 
+INIT_XMM
 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
     ; load data
-    movd       xmm0, [r1]
-    pxor       xmm1, xmm1
+    movd       m0, [r1]
+    pxor       m1, m1
 
     ; calculate DC
-    paddw      xmm0, [pw_4]
-    movd       [r1], xmm1
-    lea          r1, [r0+r2*2]
-    movd       xmm2, [r0]
-    movd       xmm3, [r0+r2]
-    movd       xmm4, [r1]
-    movd       xmm5, [r1+r2]
-    psraw      xmm0, 3
-    pshuflw    xmm0, xmm0, 0
-    punpcklqdq xmm0, xmm0
-    punpckldq  xmm2, xmm3
-    punpckldq  xmm4, xmm5
-    punpcklbw  xmm2, xmm1
-    punpcklbw  xmm4, xmm1
-    paddw      xmm2, xmm0
-    paddw      xmm4, xmm0
-    packuswb   xmm2, xmm4
-    movd       [r0], xmm2
-    pextrd  [r0+r2], xmm2, 1
-    pextrd     [r1], xmm2, 2
-    pextrd  [r1+r2], xmm2, 3
+    paddw      m0, [pw_4]
+    movd     [r1], m1
+    lea        r1, [r0+r2*2]
+    movd       m2, [r0]
+    movd       m3, [r0+r2]
+    movd       m4, [r1]
+    movd       m5, [r1+r2]
+    psraw      m0, 3
+    pshuflw    m0, m0, 0
+    punpcklqdq m0, m0
+    punpckldq  m2, m3
+    punpckldq  m4, m5
+    punpcklbw  m2, m1
+    punpcklbw  m4, m1
+    paddw      m2, m0
+    paddw      m4, m0
+    packuswb   m2, m4
+    movd      [r0], m2
+    pextrd [r0+r2], m2, 1
+    pextrd    [r1], m2, 2
+    pextrd [r1+r2], m2, 3
+    RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4_mmx, 3, 3
+    ; load data
+    movd       m0, [r1+32*0] ; A
+    movd       m1, [r1+32*2] ; C
+    punpcklwd  m0, [r1+32*1] ; A B
+    punpcklwd  m1, [r1+32*3] ; C D
+    punpckldq  m0, m1        ; A B C D
+    pxor       m6, m6
+
+    ; calculate DC
+    paddw      m0, [pw_4]
+    movd [r1+32*0], m6
+    movd [r1+32*1], m6
+    movd [r1+32*2], m6
+    movd [r1+32*3], m6
+    psraw      m0, 3
+    psubw      m6, m0
+    packuswb   m0, m0
+    packuswb   m6, m6
+    punpcklbw  m0, m0 ; AABBCCDD
+    punpcklbw  m6, m6 ; AABBCCDD
+    movq       m1, m0
+    movq       m7, m6
+    punpcklbw  m0, m0 ; AAAABBBB
+    punpckhbw  m1, m1 ; CCCCDDDD
+    punpcklbw  m6, m6 ; AAAABBBB
+    punpckhbw  m7, m7 ; CCCCDDDD
+
+    ; add DC
+    lea        r1, [r0+r2*2]
+    ADD_DC     m0, m6, 0, mova
+    ADD_DC     m1, m7, 8, mova
+    RET
+
+INIT_XMM
+cglobal vp8_idct_dc_add4_sse2, 3, 3
+    ; load data
+    movd       m0, [r1+32*0] ; A
+    movd       m1, [r1+32*2] ; C
+    punpcklwd  m0, [r1+32*1] ; A B
+    punpcklwd  m1, [r1+32*3] ; C D
+    punpckldq  m0, m1        ; A B C D
+    pxor       m1, m1
+
+    ; calculate DC
+    paddw      m0, [pw_4]
+    movd [r1+32*0], m1
+    movd [r1+32*1], m1
+    movd [r1+32*2], m1
+    movd [r1+32*3], m1
+    psraw      m0, 3
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+
+    ; add DC
+    lea        r1, [r0+r2*2]
+    ADD_DC     m0, m1, 0, mova
     RET
 
 ;-----------------------------------------------------------------------------
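
A note on the ADD_DC macro above: paddusb and psubusb saturate as unsigned
bytes, but the DC being applied is signed. The callers therefore prepare two
broadcast registers, the DC clamped to [0,255] (%1, via packuswb) and its
negation clamped the same way (%2, via psubw then packuswb); at most one of
the two is nonzero, so the add-then-subtract pair amounts to a signed DC add
with correct clamping. A per-byte C model of the identity (add_dc_byte is a
hypothetical name, for illustration only):

#include <stdint.h>

static uint8_t add_dc_byte(uint8_t pix, int dc)
{
    uint8_t p = dc > 0 ? (dc > 255 ? 255 : dc) : 0;    /* packuswb(dc)  */
    uint8_t n = dc < 0 ? (-dc > 255 ? 255 : -dc) : 0;  /* packuswb(-dc) */
    int t = pix + p;
    t = t > 255 ? 255 : t;          /* paddusb: add, saturate high     */
    t -= n;
    return t < 0 ? 0 : (uint8_t)t;  /* psubusb: subtract, floor at 0   */
}

With dc >= 0 the subtract is a no-op; with dc < 0 the add is, and psubusb's
floor at zero supplies the low-side clamp.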
