avcodec/vvcdec: refactor out deblock boundary strength stage

The deblock boundary strength stage utilizes ~5% of CPU resources for 8K clips.
It's worth considering it as a standalone stage. This stage has been relocated
to follow the parser process, allowing us to reuse CUs and TUs before releasing them.
master
Nuo Mi 1 month ago
parent 48a1a12968
commit 634780f3cf
  1. 27
      libavcodec/vvc/filter.c
  2. 9
      libavcodec/vvc/filter.h
  3. 24
      libavcodec/vvc/thread.c

@ -678,12 +678,14 @@ static void vvc_deblock_bs_chroma(const VVCLocalContext *lc,
typedef void (*deblock_bs_fn)(const VVCLocalContext *lc, const int x0, const int y0,
const int width, const int height, const int rs, const int vertical);
static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0, const int rs, const int vertical)
void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs)
{
const VVCFrameContext *fc = lc->fc;
const VVCSPS *sps = fc->ps.sps;
const VVCPPS *pps = fc->ps.pps;
const int ctb_size = sps->ctb_size_y;
const int x0 = rx << sps->ctb_log2_size_y;
const int y0 = ry << sps->ctb_log2_size_y;
const int x_end = FFMIN(x0 + ctb_size, pps->width) >> MIN_TU_LOG2;
const int y_end = FFMIN(y0 + ctb_size, pps->height) >> MIN_TU_LOG2;
const int has_chroma = !!sps->r->sps_chroma_format_idc;
@ -691,15 +693,18 @@ static void vvc_deblock_bs(const VVCLocalContext *lc, const int x0, const int y0
vvc_deblock_bs_luma, vvc_deblock_bs_chroma
};
for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) {
const int hs = sps->hshift[is_chroma];
const int vs = sps->vshift[is_chroma];
for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) {
for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) {
const int off = y * fc->ps.pps->min_tu_width + x;
if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) {
deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2,
fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical);
ff_vvc_decode_neighbour(lc, x0, y0, rx, ry, rs);
for (int vertical = 0; vertical <= 1; vertical++) {
for (int is_chroma = 0; is_chroma <= has_chroma; is_chroma++) {
const int hs = sps->hshift[is_chroma];
const int vs = sps->vshift[is_chroma];
for (int y = y0 >> MIN_TU_LOG2; y < y_end; y++) {
for (int x = x0 >> MIN_TU_LOG2; x < x_end; x++) {
const int off = y * fc->ps.pps->min_tu_width + x;
if ((fc->tab.tb_pos_x0[is_chroma][off] >> MIN_TU_LOG2) == x && (fc->tab.tb_pos_y0[is_chroma][off] >> MIN_TU_LOG2) == y) {
deblock_bs[is_chroma](lc, x << MIN_TU_LOG2, y << MIN_TU_LOG2,
fc->tab.tb_width[is_chroma][off] << hs, fc->tab.tb_height[is_chroma][off] << vs, rs, vertical);
}
}
}
}
@ -795,8 +800,6 @@ static void vvc_deblock(const VVCLocalContext *lc, int x0, int y0, const int rs,
const uint8_t no_p[4] = { 0 };
const uint8_t no_q[4] = { 0 } ;
vvc_deblock_bs(lc, x0, y0, rs, vertical);
if (!vertical) {
FFSWAP(int, x_end, y_end);
FFSWAP(int, x0, y0);

@ -33,6 +33,15 @@
*/
void ff_vvc_lmcs_filter(const VVCLocalContext *lc, const int x0, const int y0);
/**
* derive deblocking boundary strengths for the CTU, covering both edge
* directions and, when the stream has chroma, both luma and chroma
* @param lc local context for CTU
* @param rx raster x position for the CTU, in CTU units
* @param ry raster y position for the CTU, in CTU units
* @param rs raster position for the CTU
*/
void ff_vvc_deblock_bs(VVCLocalContext *lc, const int rx, const int ry, const int rs);
/**
* vertical deblock filter for the CTU
* @param lc local context for CTU

@ -42,6 +42,7 @@ typedef struct ProgressListener {
typedef enum VVCTaskStage {
VVC_TASK_STAGE_INIT, // for CTU(0, 0) only
VVC_TASK_STAGE_PARSE,
VVC_TASK_STAGE_DEBLOCK_BS,
VVC_TASK_STAGE_INTER,
VVC_TASK_STAGE_RECON,
VVC_TASK_STAGE_LMCS,
@ -111,6 +112,7 @@ static void add_task(VVCContext *s, VVCTask *t)
const int priorities[] = {
0, // VVC_TASK_STAGE_INIT,
0, // VVC_TASK_STAGE_PARSE,
1, // VVC_TASK_STAGE_DEBLOCK_BS
// For an 8K clip, a CTU line completed in the reference frame may trigger 64 and more inter tasks.
// We assign these tasks the lowest priority to avoid being overwhelmed with inter tasks.
PRIORITY_LOWEST, // VVC_TASK_STAGE_INTER
@ -181,6 +183,8 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin
// l:left, r:right, t: top, b: bottom
static const uint8_t target_score[] =
{
2, //VVC_TASK_STAGE_DEBLOCK_BS,need l + t parse
0, //VVC_TASK_STAGE_INTER, not used
2, //VVC_TASK_STAGE_RECON, need l + rt recon
3, //VVC_TASK_STAGE_LMCS, need r + b + rb recon
1, //VVC_TASK_STAGE_DEBLOCK_V, need l deblock v
@ -202,7 +206,7 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin
} else if (stage == VVC_TASK_STAGE_INTER) {
target = atomic_load(&t->target_inter_score);
} else {
target = target_score[stage - VVC_TASK_STAGE_RECON];
target = target_score[stage - VVC_TASK_STAGE_DEBLOCK_BS];
}
//+1 for previous stage
@ -348,6 +352,10 @@ static void task_stage_done(const VVCTask *t, VVCContext *s)
//this is a reverse map of ready_score, ordered by zigzag
if (stage == VVC_TASK_STAGE_PARSE) {
ADD( 0, 1, VVC_TASK_STAGE_DEBLOCK_BS);
ADD( 1, 0, VVC_TASK_STAGE_DEBLOCK_BS);
if (t->rx < 0 || t->rx >= ft->ctu_width || t->ry < 0 || t->ry >= ft->ctu_height)
return;
parse_task_done(s, fc, t->rx, t->ry);
} else if (stage == VVC_TASK_STAGE_RECON) {
ADD(-1, 1, VVC_TASK_STAGE_RECON);
@ -481,6 +489,14 @@ static int run_parse(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
return 0;
}
/*
 * Task handler for the deblock boundary strength stage.
 *
 * Calls ff_vvc_deblock_bs() for the CTU described by the task (rx, ry, rs)
 * unless sh_deblocking_filter_disabled_flag, read through lc->sc->sh, is set.
 * The context argument s is not used here. Always returns 0.
 */
static int run_deblock_bs(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
{
    /* Deblocking is switched off: there is nothing to derive for this CTU. */
    if (lc->sc->sh.r->sh_deblocking_filter_disabled_flag)
        return 0;

    ff_vvc_deblock_bs(lc, t->rx, t->ry, t->rs);

    return 0;
}
static int run_inter(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
{
VVCFrameContext *fc = lc->fc;
@ -590,6 +606,7 @@ static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
const static char* task_name[] = {
"INIT",
"P",
"B",
"I",
"R",
"L",
@ -611,6 +628,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc)
static const run_func run[] = {
run_init,
run_parse,
run_deblock_bs,
run_inter,
run_recon,
run_lmcs,
@ -701,9 +719,9 @@ static void frame_thread_init_score(VVCFrameContext *fc)
const VVCFrameThread *ft = fc->ft;
VVCTask task;
task_init(&task, VVC_TASK_STAGE_RECON, fc, 0, 0);
task_init(&task, VVC_TASK_STAGE_PARSE, fc, 0, 0);
for (int i = VVC_TASK_STAGE_RECON; i < VVC_TASK_STAGE_LAST; i++) {
for (int i = VVC_TASK_STAGE_PARSE; i < VVC_TASK_STAGE_LAST; i++) {
task.stage = i;
for (task.rx = -1; task.rx <= ft->ctu_width; task.rx++) {

Loading…
Cancel
Save