From c43485f70765cb488bfdf95dc783bb9b14eb1179 Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov Date: Thu, 23 Jun 2016 18:07:00 +0100 Subject: [PATCH] diracdec: rewrite HQ slice decoding Now coefficients are written to a buffer and are then dequantized by the new SIMD dequantization functions. For the lower bands without enough coefficients to fill a register (and hence they overwrite) the C version of the dequantization function is used. The buffer is per-thread and will be realloc'd if anything changes. This prevents regressions and having to limit slice size. Signed-off-by: Rostislav Pehlivanov --- libavcodec/diracdec.c | 126 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 111 insertions(+), 15 deletions(-) diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c index ad33809413..7913656991 100644 --- a/libavcodec/diracdec.c +++ b/libavcodec/diracdec.c @@ -161,6 +161,10 @@ typedef struct DiracContext { unsigned num_x; /* number of horizontal slices */ unsigned num_y; /* number of vertical slices */ + uint8_t *thread_buf; /* Per-thread buffer for coefficient storage */ + int threads_num_buf; /* Current # of buffers allocated */ + int thread_buf_size; /* Each thread has a buffer this size */ + struct { unsigned width; unsigned height; @@ -370,6 +374,10 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx) s->avctx = avctx; s->frame_number = -1; + s->thread_buf = NULL; + s->threads_num_buf = -1; + s->thread_buf_size = -1; + ff_diracdsp_init(&s->diracdsp); ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx); ff_videodsp_init(&s->vdsp, 8); @@ -403,6 +411,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx) for (i = 0; i < MAX_FRAMES; i++) av_frame_free(&s->all_frames[i].avframe); + av_freep(&s->thread_buf); + return 0; } @@ -760,46 +770,108 @@ static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg) return 0; } +typedef struct SliceCoeffs { + int left; + int top; + int tot_h; + int tot_v; + int tot; +} SliceCoeffs; + +static 
int subband_coeffs(DiracContext *s, int x, int y, int p, + SliceCoeffs c[MAX_DWT_LEVELS]) +{ + int level, coef = 0; + for (level = 0; level < s->wavelet_depth; level++) { + SliceCoeffs *o = &c[level]; + SubBand *b = &s->plane[p].band[level][3]; /* orientation doesn't matter */ + o->top = b->height * y / s->num_y; + o->left = b->width * x / s->num_x; + o->tot_h = ((b->width * (x + 1)) / s->num_x) - o->left; + o->tot_v = ((b->height * (y + 1)) / s->num_y) - o->top; + o->tot = o->tot_h*o->tot_v; + coef += o->tot * (4 - !!level); + } + return coef; +} + /** * VC-2 Specification -> * 13.5.3 hq_slice(sx,sy) */ -static int decode_hq_slice(AVCodecContext *avctx, void *arg) +static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf) { - int i, quant, level, orientation, quant_idx; - uint8_t quants[MAX_DWT_LEVELS][4]; - DiracContext *s = avctx->priv_data; - DiracSlice *slice = arg; + int i, level, orientation, quant_idx; + int qfactor[MAX_DWT_LEVELS][4], qoffset[MAX_DWT_LEVELS][4]; GetBitContext *gb = &slice->gb; + SliceCoeffs coeffs_num[MAX_DWT_LEVELS]; skip_bits_long(gb, 8*s->highquality.prefix_bytes); quant_idx = get_bits(gb, 8); + if (quant_idx > DIRAC_MAX_QUANT_INDEX) { + av_log(s->avctx, AV_LOG_ERROR, "Invalid quantization index - %i\n", quant_idx); + return AVERROR_INVALIDDATA; + } + /* Slice quantization (slice_quantizers() in the specs) */ for (level = 0; level < s->wavelet_depth; level++) { for (orientation = !!level; orientation < 4; orientation++) { - quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0); - quants[level][orientation] = quant; + const int quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0); + qfactor[level][orientation] = ff_dirac_qscale_tab[quant]; + qoffset[level][orientation] = ff_dirac_qoffset_intra_tab[quant] + 2; } } /* Luma + 2 Chroma planes */ for (i = 0; i < 3; i++) { - int64_t length = s->highquality.size_scaler * get_bits(gb, 8); - int64_t bits_left = 8 * length; - int64_t bits_end = 
get_bits_count(gb) + bits_left; + int c, coef_num, coef_par, off = 0; + int64_t length = s->highquality.size_scaler*get_bits(gb, 8); + int64_t start = get_bits_count(gb); + int64_t bits_end = start + 8*length; if (bits_end >= INT_MAX) { av_log(s->avctx, AV_LOG_ERROR, "end too far away\n"); return AVERROR_INVALIDDATA; } + coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num); + + if (s->pshift) { + int32_t *dst = (int32_t *)tmp_buf; + for (c = 0; c < coef_num; c++) + dst[c] = dirac_get_se_golomb(gb); + coef_par = c; + } else { + int16_t *dst = (int16_t *)tmp_buf; + for (c = 0; c < coef_num; c++) + dst[c] = dirac_get_se_golomb(gb); + coef_par = c; + } + + if (coef_num > coef_par) { + const int start_b = coef_par * (4 >> s->pshift); + const int end_b = coef_num * (4 >> s->pshift); + memset(&tmp_buf[start_b], 0, end_b - start_b); + } + for (level = 0; level < s->wavelet_depth; level++) { + const SliceCoeffs *c = &coeffs_num[level]; for (orientation = !!level; orientation < 4; orientation++) { - decode_subband(s, gb, quants[level][orientation], slice->slice_x, slice->slice_y, bits_end, - &s->plane[i].band[level][orientation], NULL); + const SubBand *b1 = &s->plane[i].band[level][orientation]; + uint8_t *buf = b1->ibuf + c->top * b1->stride + (c->left << (s->pshift + 1)); + + /* Change to c->tot_h <= 4 for AVX2 dequantization */ + const int qfunc = s->pshift + 2*(c->tot_h <= 2); + s->diracdsp.dequant_subband[qfunc](&tmp_buf[off], buf, b1->stride, + qfactor[level][orientation], + qoffset[level][orientation], + c->tot_v, c->tot_h); + + off += c->tot << (s->pshift + 1); } } + skip_bits_long(gb, bits_end - get_bits_count(gb)); } @@ -811,8 +883,9 @@ static int decode_hq_slice_row(AVCodecContext *avctx, void *arg, int jobnr, int int i; DiracContext *s = avctx->priv_data; DiracSlice *slices = ((DiracSlice *)arg) + s->num_x*jobnr; + uint8_t *thread_buf = &s->thread_buf[s->thread_buf_size*threadnr]; for (i = 0; i < s->num_x; i++) - decode_hq_slice(avctx, 
&slices[i]); + decode_hq_slice(s, &slices[i], thread_buf); return 0; } @@ -824,15 +897,32 @@ static int decode_lowdelay(DiracContext *s) { AVCodecContext *avctx = s->avctx; int slice_x, slice_y, bufsize; - int64_t bytes = 0; + int64_t coef_buf_size, bytes = 0; const uint8_t *buf; DiracSlice *slices; + SliceCoeffs tmp[MAX_DWT_LEVELS]; int slice_num = 0; slices = av_mallocz_array(s->num_x, s->num_y * sizeof(DiracSlice)); if (!slices) return AVERROR(ENOMEM); + /* 8 because that's how much the golomb reader could overread junk data + * from another plane/slice at most, and 512 because SIMD */ + coef_buf_size = subband_coeffs(s, s->num_x - 1, s->num_y - 1, 0, tmp) + 8; + coef_buf_size = (coef_buf_size << (1 + s->pshift)) + 512; + + if (s->threads_num_buf != avctx->thread_count || + s->thread_buf_size != coef_buf_size) { + s->threads_num_buf = avctx->thread_count; + s->thread_buf_size = coef_buf_size; + s->thread_buf = av_realloc_f(s->thread_buf, avctx->thread_count, s->thread_buf_size); + if (!s->thread_buf) { + av_log(s->avctx, AV_LOG_ERROR, "thread buffer allocation failure\n"); + return AVERROR(ENOMEM); + } + } + align_get_bits(&s->gb); /*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */ buf = s->gb.buffer + get_bits_count(&s->gb)/8; @@ -848,7 +938,7 @@ static int decode_lowdelay(DiracContext *s) if (bytes <= bufsize/8) bytes += buf[bytes] * s->highquality.size_scaler + 1; } - if (bytes >= INT_MAX) { + if (bytes >= INT_MAX || bytes*8 > bufsize) { av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n"); av_free(slices); return AVERROR_INVALIDDATA; @@ -867,6 +957,12 @@ static int decode_lowdelay(DiracContext *s) bufsize = 0; } } + + if (s->num_x*s->num_y != slice_num) { + av_log(s->avctx, AV_LOG_ERROR, "too few slices\n"); + return AVERROR_INVALIDDATA; + } + avctx->execute2(avctx, decode_hq_slice_row, slices, NULL, s->num_y); } else { for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {