VC1: merge idct8x8, coeff adjustments and put_pixels.

Merging these functions allows merging some loops, which makes the
results (particularly after SIMD optimizations) much faster.
oldabi
Ronald S. Bultje 14 years ago
parent 8d9ac969cb
commit f8bed30d8b
  1. 64
      libavcodec/ppc/vc1dsp_altivec.c
  2. 28
      libavcodec/vc1.c
  3. 58
      libavcodec/vc1dec.c
  4. 54
      libavcodec/vc1dsp.c
  5. 6
      libavcodec/vc1dsp.h

@ -130,7 +130,8 @@ do { \
/** Do inverse transform on 8x8 block
*/
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64],
int sign, int rangered)
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
@ -144,7 +145,9 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector signed int vec_1s = vec_splat_s32(1);
const vector unsigned int vec_1 = vec_splat_u32(1);
const vector unsigned short rangered_shift = vec_splat_u16(1);
const vector signed short signed_bias = vec_sl(vec_splat_u16(4),
vec_splat_u16(4));
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
@ -214,6 +217,27 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
if (rangered) {
if (!sign) {
vec_sub(src0, signed_bias);
vec_sub(src1, signed_bias);
vec_sub(src2, signed_bias);
vec_sub(src3, signed_bias);
vec_sub(src4, signed_bias);
vec_sub(src5, signed_bias);
vec_sub(src6, signed_bias);
vec_sub(src7, signed_bias);
}
vec_sl(src0, rangered_shift);
vec_sl(src1, rangered_shift);
vec_sl(src2, rangered_shift);
vec_sl(src3, rangered_shift);
vec_sl(src4, rangered_shift);
vec_sl(src5, rangered_shift);
vec_sl(src6, rangered_shift);
vec_sl(src7, rangered_shift);
}
vec_st(src0, 0, block);
vec_st(src1, 16, block);
vec_st(src2, 32, block);
@ -224,6 +248,36 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
vec_st(src7,112, block);
}
static void vc1_inv_trans_8x8_add_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 0, 0);
ff_add_pixels_clamped_c(b, dest, stride);
}
static void vc1_inv_trans_8x8_put_signed_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 1, 0);
ff_put_signed_pixels_clamped_c(b, dest, stride);
}
static void vc1_inv_trans_8x8_put_signed_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 1, 1);
ff_put_signed_pixels_clamped_c(b, dest, stride);
}
static void vc1_inv_trans_8x8_put_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 0, 0);
ff_put_pixels_clamped_c(b, dest, stride);
}
static void vc1_inv_trans_8x8_put_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 0, 1);
ff_put_pixels_clamped_c(b, dest, stride);
}
/** Do inverse transform on 8x4 part of block
*/
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
@ -342,7 +396,11 @@ void ff_vc1dsp_init_altivec(VC1DSPContext* dsp)
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_altivec;
dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_altivec;
dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_altivec;
dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_altivec;
dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_altivec;
dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;

@ -280,6 +280,28 @@ static int vop_dquant_decoding(VC1Context *v)
static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);
static void simple_idct_put_rangered(uint8_t *dest, int line_size, DCTELEM *block)
{
int i;
ff_simple_idct(block);
for (i = 0; i < 64; i++) block[i] = (block[i] - 64) << 1;
ff_put_pixels_clamped_c(block, dest, line_size);
}
static void simple_idct_put_signed(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_simple_idct(block);
ff_put_signed_pixels_clamped_c(block, dest, line_size);
}
static void simple_idct_put_signed_rangered(uint8_t *dest, int line_size, DCTELEM *block)
{
int i;
ff_simple_idct(block);
for (i = 0; i < 64; i++) block[i] <<= 1;
ff_put_signed_pixels_clamped_c(block, dest, line_size);
}
/**
* Decode Simple/Main Profiles sequence header
* @see Figure 7-8, p16-17
@ -337,7 +359,11 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte
v->res_fasttx = get_bits1(gb);
if (!v->res_fasttx)
{
v->vc1dsp.vc1_inv_trans_8x8 = ff_simple_idct;
v->vc1dsp.vc1_inv_trans_8x8_add = ff_simple_idct_add;
v->vc1dsp.vc1_inv_trans_8x8_put[0] = ff_simple_idct_put;
v->vc1dsp.vc1_inv_trans_8x8_put[1] = simple_idct_put_rangered;
v->vc1dsp.vc1_inv_trans_8x8_put_signed[0] = simple_idct_put_signed;
v->vc1dsp.vc1_inv_trans_8x8_put_signed[1] = simple_idct_put_signed_rangered;
v->vc1dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
v->vc1dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
v->vc1dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;

@ -2009,8 +2009,7 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
if(i==1)
v->vc1dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
else{
v->vc1dsp.vc1_inv_trans_8x8(block);
s->dsp.add_pixels_clamped(block, dst, linesize);
v->vc1dsp.vc1_inv_trans_8x8_add(dst, linesize, block);
}
if(apply_filter && cbp_top & 0xC)
v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
@ -2117,7 +2116,7 @@ static int vc1_decode_p_mb(VC1Context *v)
{
MpegEncContext *s = &v->s;
GetBitContext *gb = &s->gb;
int i, j;
int i;
int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
int cbp; /* cbp decoding stuff */
int mqdiff, mquant; /* MB quantization */
@ -2149,6 +2148,8 @@ static int vc1_decode_p_mb(VC1Context *v)
{
if (!skipped)
{
vc1_idct_func idct8x8_fn;
GET_MVDATA(dmv_x, dmv_y);
if (s->mb_intra) {
@ -2183,6 +2184,7 @@ static int vc1_decode_p_mb(VC1Context *v)
VC1_TTMB_VLC_BITS, 2);
if(!s->mb_intra) vc1_mc_1mv(v, 0);
dst_idx = 0;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
s->dc_val[0][s->block_index[i]] = 0;
@ -2200,9 +2202,9 @@ static int vc1_decode_p_mb(VC1Context *v)
vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
i & 4 ? s->uvlinesize : s->linesize,
s->block[i]);
if(v->pq >= 9 && v->overlap) {
if(v->c_avail)
v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
@ -2267,6 +2269,7 @@ static int vc1_decode_p_mb(VC1Context *v)
{
int intra_count = 0, coded_inter = 0;
int is_intra[6], is_coded[6];
vc1_idct_func idct8x8_fn;
/* Get CBPCY */
cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
for (i=0; i<6; i++)
@ -2316,6 +2319,7 @@ static int vc1_decode_p_mb(VC1Context *v)
}
if (!v->ttmbf && coded_inter)
ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
dst_idx += i >> 2;
@ -2331,9 +2335,9 @@ static int vc1_decode_p_mb(VC1Context *v)
vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
(i&4)?s->uvlinesize:s->linesize,
s->block[i]);
if(v->pq >= 9 && v->overlap) {
if(v->c_avail)
v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
@ -2409,7 +2413,7 @@ static void vc1_decode_b_mb(VC1Context *v)
{
MpegEncContext *s = &v->s;
GetBitContext *gb = &s->gb;
int i, j;
int i;
int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
int cbp = 0; /* cbp decoding stuff */
int mqdiff, mquant; /* MB quantization */
@ -2422,6 +2426,7 @@ static void vc1_decode_b_mb(VC1Context *v)
int skipped, direct;
int dmv_x[2], dmv_y[2];
int bmvtype = BMV_TYPE_BACKWARD;
vc1_idct_func idct8x8_fn;
mquant = v->pq; /* Loosy initialization */
s->mb_intra = 0;
@ -2519,6 +2524,7 @@ static void vc1_decode_b_mb(VC1Context *v)
}
}
dst_idx = 0;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
s->dc_val[0][s->block_index[i]] = 0;
@ -2536,9 +2542,9 @@ static void vc1_decode_b_mb(VC1Context *v)
vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
i & 4 ? s->uvlinesize : s->linesize,
s->block[i]);
} else if(val) {
vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block, s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize, (i&4) && (s->flags & CODEC_FLAG_GRAY), 0, 0, 0);
if(!v->ttmbf && ttmb < 8) ttmb = -1;
@ -2551,11 +2557,12 @@ static void vc1_decode_b_mb(VC1Context *v)
*/
static void vc1_decode_i_blocks(VC1Context *v)
{
int k, j;
int k;
MpegEncContext *s = &v->s;
int cbp, val;
uint8_t *coded_val;
int mb_pos;
vc1_idct_func idct8x8_fn;
/* select codingmode used for VLC tables selection */
switch(v->y_ac_table_index){
@ -2590,6 +2597,10 @@ static void vc1_decode_i_blocks(VC1Context *v)
s->mb_x = s->mb_y = 0;
s->mb_intra = 1;
s->first_slice_line = 1;
if(v->pq >= 9 && v->overlap) {
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
} else
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put[!!v->rangeredfrm];
for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
s->mb_x = 0;
ff_init_block_index(s);
@ -2626,14 +2637,9 @@ static void vc1_decode_i_blocks(VC1Context *v)
vc1_decode_i_block(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2);
if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
if(v->pq >= 9 && v->overlap) {
if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
} else {
if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] = (s->block[k][j] - 64) << 1;
s->dsp.put_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
}
idct8x8_fn(dst[k],
k & 4 ? s->uvlinesize : s->linesize,
s->block[k]);
}
if(v->pq >= 9 && v->overlap) {
@ -2691,6 +2697,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
int mqdiff;
int overlap;
GetBitContext *gb = &s->gb;
vc1_idct_func idct8x8_fn;
/* select codingmode used for VLC tables selection */
switch(v->y_ac_table_index){
@ -2721,6 +2728,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
s->mb_x = s->mb_y = 0;
s->mb_intra = 1;
s->first_slice_line = 1;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[0];
for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
s->mb_x = 0;
ff_init_block_index(s);
@ -2777,9 +2785,9 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
vc1_decode_i_block_adv(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2, mquant);
if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
s->dsp.put_signed_pixels_clamped(s->block[k], dst[k],
k & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(dst[k],
k & 4 ? s->uvlinesize : s->linesize,
s->block[k]);
}
if(overlap) {

@ -199,7 +199,7 @@ static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
}
}
static void vc1_inv_trans_8x8_c(DCTELEM block[64])
static av_always_inline void vc1_inv_trans_8x8_c(DCTELEM block[64], int shl, int sub)
{
int i;
register int t1,t2,t3,t4,t5,t6,t7,t8;
@ -254,20 +254,50 @@ static void vc1_inv_trans_8x8_c(DCTELEM block[64])
t3 = 9 * src[ 8] - 16 * src[24] + 4 * src[40] + 15 * src[56];
t4 = 4 * src[ 8] - 9 * src[24] + 15 * src[40] - 16 * src[56];
dst[ 0] = (t5 + t1) >> 7;
dst[ 8] = (t6 + t2) >> 7;
dst[16] = (t7 + t3) >> 7;
dst[24] = (t8 + t4) >> 7;
dst[32] = (t8 - t4 + 1) >> 7;
dst[40] = (t7 - t3 + 1) >> 7;
dst[48] = (t6 - t2 + 1) >> 7;
dst[56] = (t5 - t1 + 1) >> 7;
dst[ 0] = (((t5 + t1 ) >> 7) - sub) << shl;
dst[ 8] = (((t6 + t2 ) >> 7) - sub) << shl;
dst[16] = (((t7 + t3 ) >> 7) - sub) << shl;
dst[24] = (((t8 + t4 ) >> 7) - sub) << shl;
dst[32] = (((t8 - t4 + 1) >> 7) - sub) << shl;
dst[40] = (((t7 - t3 + 1) >> 7) - sub) << shl;
dst[48] = (((t6 - t2 + 1) >> 7) - sub) << shl;
dst[56] = (((t5 - t1 + 1) >> 7) - sub) << shl;
src++;
dst++;
}
}
static void vc1_inv_trans_8x8_add_c(uint8_t *dest, int linesize, DCTELEM *block)
{
vc1_inv_trans_8x8_c(block, 0, 0);
ff_add_pixels_clamped_c(block, dest, linesize);
}
static void vc1_inv_trans_8x8_put_signed_c(uint8_t *dest, int linesize, DCTELEM *block)
{
vc1_inv_trans_8x8_c(block, 0, 0);
ff_put_signed_pixels_clamped_c(block, dest, linesize);
}
static void vc1_inv_trans_8x8_put_signed_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
{
vc1_inv_trans_8x8_c(block, 1, 0);
ff_put_signed_pixels_clamped_c(block, dest, linesize);
}
static void vc1_inv_trans_8x8_put_c(uint8_t *dest, int linesize, DCTELEM *block)
{
vc1_inv_trans_8x8_c(block, 0, 0);
ff_put_pixels_clamped_c(block, dest, linesize);
}
static void vc1_inv_trans_8x8_put_rangered_c(uint8_t *dest, int linesize, DCTELEM *block)
{
vc1_inv_trans_8x8_c(block, 1, 64);
ff_put_pixels_clamped_c(block, dest, linesize);
}
/** Do inverse transform on 8x4 part of block
*/
static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
@ -662,7 +692,11 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}
av_cold void ff_vc1dsp_init(VC1DSPContext* dsp) {
dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_c;
dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_c;
dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_c;
dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_c;
dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_c;
dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_c;
dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c;
dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c;
dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c;

@ -30,9 +30,13 @@
#include "dsputil.h"
typedef void (*vc1_idct_func)(uint8_t *dest, int line_size, DCTELEM *block);
typedef struct VC1DSPContext {
/* vc1 functions */
void (*vc1_inv_trans_8x8)(DCTELEM *b);
vc1_idct_func vc1_inv_trans_8x8_add;
vc1_idct_func vc1_inv_trans_8x8_put_signed[2];
vc1_idct_func vc1_inv_trans_8x8_put[2];
void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);

Loading…
Cancel
Save