hevcdsp: split the qpel functions by width instead of by the subpixel fraction

This should allow for more efficient SIMD.

Keep the C versions as they are now, to allow the compiler to inline the
interpolation coefficients.
pull/166/head
Anton Khirnov 10 years ago
parent 6788baebb3
commit 1f821750f0
  1. 19
      libavcodec/hevc.c
  2. 30
      libavcodec/hevcdsp.c
  3. 6
      libavcodec/hevcdsp.h
  4. 82
      libavcodec/hevcdsp_template.c

@ -1479,7 +1479,7 @@ static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
*/ */
static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride, static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
AVFrame *ref, const Mv *mv, int x_off, int y_off, AVFrame *ref, const Mv *mv, int x_off, int y_off,
int block_w, int block_h) int block_w, int block_h, int pred_idx)
{ {
HEVCLocalContext *lc = &s->HEVClc; HEVCLocalContext *lc = &s->HEVClc;
uint8_t *src = ref->data[0]; uint8_t *src = ref->data[0];
@ -1513,8 +1513,8 @@ static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
src = lc->edge_emu_buffer + buf_offset; src = lc->edge_emu_buffer + buf_offset;
srcstride = edge_emu_stride; srcstride = edge_emu_stride;
} }
s->hevcdsp.put_hevc_qpel[my][mx](dst, dststride, src, srcstride, block_w, s->hevcdsp.put_hevc_qpel[!!my][!!mx][pred_idx](dst, dststride, src, srcstride,
block_h, lc->mc_buffer); block_h, mx, my, lc->mc_buffer);
} }
/** /**
@ -1651,6 +1651,11 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
int nPbW, int nPbH, int nPbW, int nPbH,
int log2_cb_size, int partIdx) int log2_cb_size, int partIdx)
{ {
/* Map the prediction block width nPbW to the index used for the width
 * dimension of put_hevc_qpel (handed to luma_mc() as pred_idx). Widths
 * without a designated entry are zero-initialized and thus map to index 0. */
static const int pred_indices[] = {
[4] = 0, [8] = 1, [12] = 2, [16] = 3, [24] = 4, [32] = 5, [48] = 6, [64] = 7,
};
const int pred_idx = pred_indices[nPbW];
#define POS(c_idx, x, y) \ #define POS(c_idx, x, y) \
&s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
(((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
@ -1719,7 +1724,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]); DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
luma_mc(s, tmp, tmpstride, ref0->frame, luma_mc(s, tmp, tmpstride, ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH); &current_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx);
if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
@ -1755,7 +1760,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]); DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
luma_mc(s, tmp, tmpstride, ref1->frame, luma_mc(s, tmp, tmpstride, ref1->frame,
&current_mv.mv[1], x0, y0, nPbW, nPbH); &current_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx);
if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
@ -1792,9 +1797,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
DECLARE_ALIGNED(16, int16_t, tmp4[MAX_PB_SIZE * MAX_PB_SIZE]); DECLARE_ALIGNED(16, int16_t, tmp4[MAX_PB_SIZE * MAX_PB_SIZE]);
luma_mc(s, tmp, tmpstride, ref0->frame, luma_mc(s, tmp, tmpstride, ref0->frame,
&current_mv.mv[0], x0, y0, nPbW, nPbH); &current_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx);
luma_mc(s, tmp2, tmpstride, ref1->frame, luma_mc(s, tmp2, tmpstride, ref1->frame,
&current_mv.mv[1], x0, y0, nPbW, nPbH); &current_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx);
if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
(s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {

@ -116,6 +116,12 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
#undef FUNC #undef FUNC
#define FUNC(a, depth) a ## _ ## depth #define FUNC(a, depth) a ## _ ## depth
/*
 * Register the four width-specialised luma qpel functions for one block
 * width in the put_hevc_qpel table.
 *
 *   i     - slot in the last (width) dimension of put_hevc_qpel
 *   width - block width, pasted into the function names
 *   depth - bit depth suffix forwarded to FUNC()
 *
 * The first two table indices are used by the caller as [!!my][!!mx]:
 *   [0][0] -> pixels variant, [0][1] -> h variant,
 *   [1][0] -> v variant,      [1][1] -> hv variant.
 *
 * Expects a pointer named hevcdsp and the FUNC() macro to be in scope at the
 * point of use. Wrapped in do { } while (0) so that "QPEL_FUNC(...);" is a
 * single statement, and deliberately NOT ending in a line continuation so
 * the following source line is never spliced into this macro.
 */
#define QPEL_FUNC(i, width, depth) \
    do { \
        hevcdsp->put_hevc_qpel[0][0][i] = FUNC(put_hevc_qpel_pixels_ ## width, depth); \
        hevcdsp->put_hevc_qpel[0][1][i] = FUNC(put_hevc_qpel_h_ ## width, depth); \
        hevcdsp->put_hevc_qpel[1][0][i] = FUNC(put_hevc_qpel_v_ ## width, depth); \
        hevcdsp->put_hevc_qpel[1][1][i] = FUNC(put_hevc_qpel_hv_ ## width, depth); \
    } while (0)
#define HEVC_DSP(depth) \ #define HEVC_DSP(depth) \
hevcdsp->put_pcm = FUNC(put_pcm, depth); \ hevcdsp->put_pcm = FUNC(put_pcm, depth); \
hevcdsp->transquant_bypass[0] = FUNC(transquant_bypass4x4, depth); \ hevcdsp->transquant_bypass[0] = FUNC(transquant_bypass4x4, depth); \
@ -139,22 +145,14 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
hevcdsp->sao_edge_filter[2] = FUNC(sao_edge_filter_2, depth); \ hevcdsp->sao_edge_filter[2] = FUNC(sao_edge_filter_2, depth); \
hevcdsp->sao_edge_filter[3] = FUNC(sao_edge_filter_3, depth); \ hevcdsp->sao_edge_filter[3] = FUNC(sao_edge_filter_3, depth); \
\ \
hevcdsp->put_hevc_qpel[0][0] = FUNC(put_hevc_qpel_pixels, depth); \ QPEL_FUNC(0, 4, depth); \
hevcdsp->put_hevc_qpel[0][1] = FUNC(put_hevc_qpel_h1, depth); \ QPEL_FUNC(1, 8, depth); \
hevcdsp->put_hevc_qpel[0][2] = FUNC(put_hevc_qpel_h2, depth); \ QPEL_FUNC(2, 12, depth); \
hevcdsp->put_hevc_qpel[0][3] = FUNC(put_hevc_qpel_h3, depth); \ QPEL_FUNC(3, 16, depth); \
hevcdsp->put_hevc_qpel[1][0] = FUNC(put_hevc_qpel_v1, depth); \ QPEL_FUNC(4, 24, depth); \
hevcdsp->put_hevc_qpel[1][1] = FUNC(put_hevc_qpel_h1v1, depth); \ QPEL_FUNC(5, 32, depth); \
hevcdsp->put_hevc_qpel[1][2] = FUNC(put_hevc_qpel_h2v1, depth); \ QPEL_FUNC(6, 48, depth); \
hevcdsp->put_hevc_qpel[1][3] = FUNC(put_hevc_qpel_h3v1, depth); \ QPEL_FUNC(7, 64, depth); \
hevcdsp->put_hevc_qpel[2][0] = FUNC(put_hevc_qpel_v2, depth); \
hevcdsp->put_hevc_qpel[2][1] = FUNC(put_hevc_qpel_h1v2, depth); \
hevcdsp->put_hevc_qpel[2][2] = FUNC(put_hevc_qpel_h2v2, depth); \
hevcdsp->put_hevc_qpel[2][3] = FUNC(put_hevc_qpel_h3v2, depth); \
hevcdsp->put_hevc_qpel[3][0] = FUNC(put_hevc_qpel_v3, depth); \
hevcdsp->put_hevc_qpel[3][1] = FUNC(put_hevc_qpel_h1v3, depth); \
hevcdsp->put_hevc_qpel[3][2] = FUNC(put_hevc_qpel_h2v3, depth); \
hevcdsp->put_hevc_qpel[3][3] = FUNC(put_hevc_qpel_h3v3, depth); \
\ \
hevcdsp->put_hevc_epel[0][0] = FUNC(put_hevc_epel_pixels, depth); \ hevcdsp->put_hevc_epel[0][0] = FUNC(put_hevc_epel_pixels, depth); \
hevcdsp->put_hevc_epel[0][1] = FUNC(put_hevc_epel_h, depth); \ hevcdsp->put_hevc_epel[0][1] = FUNC(put_hevc_epel_h, depth); \

@ -58,9 +58,9 @@ typedef struct HEVCDSPContext {
int height, int c_idx, uint8_t vert_edge, int height, int c_idx, uint8_t vert_edge,
uint8_t horiz_edge, uint8_t diag_edge); uint8_t horiz_edge, uint8_t diag_edge);
void (*put_hevc_qpel[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
ptrdiff_t srcstride, int width, int height, ptrdiff_t srcstride, int height,
int16_t *mcbuffer); int mx, int my, int16_t *mcbuffer);
void (*put_hevc_epel[2][2])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, void (*put_hevc_epel[2][2])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
ptrdiff_t srcstride, int width, int height, ptrdiff_t srcstride, int width, int height,
int mx, int my, int16_t *mcbuffer); int mx, int my, int16_t *mcbuffer);

@ -775,9 +775,11 @@ static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
#undef TR_16 #undef TR_16
#undef TR_32 #undef TR_32
static void FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride, static av_always_inline void
uint8_t *_src, ptrdiff_t _srcstride, FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
int width, int height, int16_t* mcbuffer) uint8_t *_src, ptrdiff_t _srcstride,
int width, int height, int mx, int my,
int16_t* mcbuffer)
{ {
int x, y; int x, y;
pixel *src = (pixel *)_src; pixel *src = (pixel *)_src;
@ -906,6 +908,80 @@ PUT_HEVC_QPEL_HV(3, 1)
PUT_HEVC_QPEL_HV(3, 2) PUT_HEVC_QPEL_HV(3, 2)
PUT_HEVC_QPEL_HV(3, 3) PUT_HEVC_QPEL_HV(3, 3)
/**
 * Generate the four fixed-width luma qpel entry points for block width W:
 * put_hevc_qpel_{pixels,h,v,hv}_W.
 *
 * Each generated function has the put_hevc_qpel table signature (no width
 * argument) and forwards to the generic helpers with W as the width.
 *
 *  - pixels: forwards mx and my unchanged to put_hevc_qpel_pixels.
 *  - h:      selects the h1/h2/h3 helper from mx (my is not consulted).
 *  - v:      selects the v1/v2/v3 helper from my (mx is not consulted).
 *  - hv:     selects the hXvY helper from both mx and my.
 *
 * A selector value of 1 or 2 picks the matching helper; every other value
 * falls through to the "3" helper.
 */
#define QPEL(W) \
static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
                                             uint8_t *src, ptrdiff_t srcstride, \
                                             int height, int mx, int my, \
                                             int16_t *mcbuffer) \
{ \
    FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height, \
                               mx, my, mcbuffer); \
} \
 \
static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride, \
                                        uint8_t *src, ptrdiff_t srcstride, \
                                        int height, int mx, int my, \
                                        int16_t *mcbuffer) \
{ \
    switch (mx) { \
    case 1: \
        FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
        break; \
    case 2: \
        FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
        break; \
    default: \
        FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
        break; \
    } \
} \
 \
static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride, \
                                        uint8_t *src, ptrdiff_t srcstride, \
                                        int height, int mx, int my, \
                                        int16_t *mcbuffer) \
{ \
    switch (my) { \
    case 1: \
        FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
        break; \
    case 2: \
        FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
        break; \
    default: \
        FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
        break; \
    } \
} \
 \
static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride, \
                                         uint8_t *src, ptrdiff_t srcstride, \
                                         int height, int mx, int my, \
                                         int16_t *mcbuffer) \
{ \
    switch (my) { \
    case 1: \
        switch (mx) { \
        case 1: \
            FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        case 2: \
            FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        default: \
            FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        } \
        break; \
    case 2: \
        switch (mx) { \
        case 1: \
            FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        case 2: \
            FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        default: \
            FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        } \
        break; \
    default: \
        switch (mx) { \
        case 1: \
            FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        case 2: \
            FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        default: \
            FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer); \
            break; \
        } \
        break; \
    } \
}
/* Instantiate the pixels/h/v/hv wrappers for each block width; these are the
 * same widths that QPEL_FUNC registers in the put_hevc_qpel table. */
QPEL(64)
QPEL(48)
QPEL(32)
QPEL(24)
QPEL(16)
QPEL(12)
QPEL(8)
QPEL(4)
static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride, static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
uint8_t *_src, ptrdiff_t _srcstride, uint8_t *_src, ptrdiff_t _srcstride,
int width, int height, int mx, int my, int width, int height, int mx, int my,

Loading…
Cancel
Save