avcodec/hevc: Add asm opt for the following functions

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_qpel_uni_h4_8_c:          5.7     1.2
put_hevc_qpel_uni_h6_8_c:          12.2    2.7
put_hevc_qpel_uni_h8_8_c:          21.5    3.2
put_hevc_qpel_uni_h12_8_c:         47.2    9.2     7.2
put_hevc_qpel_uni_h16_8_c:         87.0    11.7    9.0
put_hevc_qpel_uni_h24_8_c:         188.2   27.5    21.0
put_hevc_qpel_uni_h32_8_c:         335.2   46.7    28.5
put_hevc_qpel_uni_h48_8_c:         772.5   104.5   65.2
put_hevc_qpel_uni_h64_8_c:         1383.2  142.2   109.0

put_hevc_epel_uni_w_v4_8_c:        5.0     1.5
put_hevc_epel_uni_w_v6_8_c:        10.7    3.5     2.5
put_hevc_epel_uni_w_v8_8_c:        18.2    3.7     3.0
put_hevc_epel_uni_w_v12_8_c:       40.2    10.7    7.5
put_hevc_epel_uni_w_v16_8_c:       70.2    13.0    9.2
put_hevc_epel_uni_w_v24_8_c:       158.2   30.2    22.5
put_hevc_epel_uni_w_v32_8_c:       281.0   52.0    36.5
put_hevc_epel_uni_w_v48_8_c:       631.7   116.7   82.7
put_hevc_epel_uni_w_v64_8_c:       1108.2  207.5   142.2

put_hevc_epel_uni_w_h4_8_c:        4.7     1.2
put_hevc_epel_uni_w_h6_8_c:        9.7     3.5     2.7
put_hevc_epel_uni_w_h8_8_c:        17.2    4.2     3.5
put_hevc_epel_uni_w_h12_8_c:       38.0    11.5    7.2
put_hevc_epel_uni_w_h16_8_c:       69.2    14.5    9.2
put_hevc_epel_uni_w_h24_8_c:       152.0   34.7    22.5
put_hevc_epel_uni_w_h32_8_c:       271.0   58.0    40.0
put_hevc_epel_uni_w_h48_8_c:       597.5   136.7   95.0
put_hevc_epel_uni_w_h64_8_c:       1074.0  252.2   168.0

put_hevc_epel_bi_h4_8_c:           4.5     0.7
put_hevc_epel_bi_h6_8_c:           9.0     1.5
put_hevc_epel_bi_h8_8_c:           15.2    1.7
put_hevc_epel_bi_h12_8_c:          33.5    4.2     3.7
put_hevc_epel_bi_h16_8_c:          59.7    5.2     4.7
put_hevc_epel_bi_h24_8_c:          132.2   11.0
put_hevc_epel_bi_h32_8_c:          232.7   20.2    13.2
put_hevc_epel_bi_h48_8_c:          521.7   45.2    31.2
put_hevc_epel_bi_h64_8_c:          949.0   71.5    51.0

After this patch, the peformance of decoding H265 4K 30FPS
30Mbps on 3A6000 with 8 threads improves 1fps(55fps-->56fsp).

Change-Id: I8cc1e41daa63ca478039bc55d1ee8934a7423f51
Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
release/7.0
jinbo 11 months ago committed by Michael Niedermayer
parent 1f642b99af
commit 9239081db3
No known key found for this signature in database
GPG Key ID: B18E8928B3948D64
  1. 1991
      libavcodec/loongarch/hevc_mc.S
  2. 66
      libavcodec/loongarch/hevcdsp_init_loongarch.c
  3. 54
      libavcodec/loongarch/hevcdsp_lasx.h
  4. 36
      libavcodec/loongarch/hevcdsp_lsx.h

File diff suppressed because it is too large Load Diff

@ -124,8 +124,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_lsx;
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_lsx;
c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_lsx;
c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_lsx;
c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_lsx;
c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_lsx;
c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_lsx;
c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_lsx;
c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lsx;
c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_lsx;
c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_lsx;
c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_lsx;
c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_lsx;
@ -138,6 +145,14 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_lsx;
c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_uni_qpel_h6_8_lsx;
c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_lsx;
c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_lsx;
c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_lsx;
c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_lsx;
c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_lsx;
c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_lsx;
c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lsx;
c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_lsx;
@ -191,6 +206,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_epel_uni_w_h4_8_lsx;
c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_epel_uni_w_h6_8_lsx;
c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_epel_uni_w_h8_8_lsx;
c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_epel_uni_w_h12_8_lsx;
c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_epel_uni_w_h16_8_lsx;
c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_epel_uni_w_h24_8_lsx;
c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_epel_uni_w_h32_8_lsx;
c->put_hevc_epel_uni_w[8][0][1] = ff_hevc_put_hevc_epel_uni_w_h48_8_lsx;
c->put_hevc_epel_uni_w[9][0][1] = ff_hevc_put_hevc_epel_uni_w_h64_8_lsx;
c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_epel_uni_w_v4_8_lsx;
c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_epel_uni_w_v6_8_lsx;
c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_epel_uni_w_v8_8_lsx;
c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_epel_uni_w_v12_8_lsx;
c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_epel_uni_w_v16_8_lsx;
c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_epel_uni_w_v24_8_lsx;
c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_epel_uni_w_v32_8_lsx;
c->put_hevc_epel_uni_w[8][1][0] = ff_hevc_put_hevc_epel_uni_w_v48_8_lsx;
c->put_hevc_epel_uni_w[9][1][0] = ff_hevc_put_hevc_epel_uni_w_v64_8_lsx;
c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@ -277,6 +312,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx;
c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx;
c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_epel_uni_w_h6_8_lasx;
c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_epel_uni_w_h8_8_lasx;
c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_epel_uni_w_h12_8_lasx;
c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_epel_uni_w_h16_8_lasx;
c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_epel_uni_w_h24_8_lasx;
c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_epel_uni_w_h32_8_lasx;
c->put_hevc_epel_uni_w[8][0][1] = ff_hevc_put_hevc_epel_uni_w_h48_8_lasx;
c->put_hevc_epel_uni_w[9][0][1] = ff_hevc_put_hevc_epel_uni_w_h64_8_lasx;
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx;
@ -285,6 +329,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_qpel_uni_w_v48_8_lasx;
c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_qpel_uni_w_v64_8_lasx;
c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_epel_uni_w_v6_8_lasx;
c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_epel_uni_w_v8_8_lasx;
c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_epel_uni_w_v12_8_lasx;
c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_epel_uni_w_v16_8_lasx;
c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_epel_uni_w_v24_8_lasx;
c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_epel_uni_w_v32_8_lasx;
c->put_hevc_epel_uni_w[8][1][0] = ff_hevc_put_hevc_epel_uni_w_v48_8_lasx;
c->put_hevc_epel_uni_w[9][1][0] = ff_hevc_put_hevc_epel_uni_w_v64_8_lasx;
c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_qpel_uni_w_h4_8_lasx;
c->put_hevc_qpel_uni_w[2][0][1] = ff_hevc_put_hevc_qpel_uni_w_h6_8_lasx;
c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_qpel_uni_w_h8_8_lasx;
@ -294,6 +347,19 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_qpel_uni_w_h32_8_lasx;
c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_qpel_uni_w_h48_8_lasx;
c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx;
c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_lasx;
c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_lasx;
c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_lasx;
c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_lasx;
c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_lasx;
c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lasx;
c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_lasx;
c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_lasx;
c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lasx;
c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_lasx;
c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_lasx;
}
}
}

@ -75,6 +75,60 @@ PEL_UNI_W(epel, hv, 32);
PEL_UNI_W(epel, hv, 48);
PEL_UNI_W(epel, hv, 64);
PEL_UNI_W(epel, v, 6);
PEL_UNI_W(epel, v, 8);
PEL_UNI_W(epel, v, 12);
PEL_UNI_W(epel, v, 16);
PEL_UNI_W(epel, v, 24);
PEL_UNI_W(epel, v, 32);
PEL_UNI_W(epel, v, 48);
PEL_UNI_W(epel, v, 64);
PEL_UNI_W(epel, h, 6);
PEL_UNI_W(epel, h, 8);
PEL_UNI_W(epel, h, 12);
PEL_UNI_W(epel, h, 16);
PEL_UNI_W(epel, h, 24);
PEL_UNI_W(epel, h, 32);
PEL_UNI_W(epel, h, 48);
PEL_UNI_W(epel, h, 64);
#undef PEL_UNI_W
#define UNI_MC(PEL, DIR, WIDTH) \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lasx(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width)
UNI_MC(qpel, h, 12);
UNI_MC(qpel, h, 16);
UNI_MC(qpel, h, 24);
UNI_MC(qpel, h, 32);
UNI_MC(qpel, h, 48);
UNI_MC(qpel, h, 64);
#undef UNI_MC
#define BI_MC(PEL, DIR, WIDTH) \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lasx(uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
const int16_t *src_16bit, \
int height, \
intptr_t mx, \
intptr_t my, \
int width)
BI_MC(epel, h, 12);
BI_MC(epel, h, 16);
BI_MC(epel, h, 32);
BI_MC(epel, h, 48);
BI_MC(epel, h, 64);
#undef BI_MC
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H

@ -126,8 +126,15 @@ BI_MC(qpel, hv, 32);
BI_MC(qpel, hv, 48);
BI_MC(qpel, hv, 64);
BI_MC(epel, h, 4);
BI_MC(epel, h, 6);
BI_MC(epel, h, 8);
BI_MC(epel, h, 12);
BI_MC(epel, h, 16);
BI_MC(epel, h, 24);
BI_MC(epel, h, 32);
BI_MC(epel, h, 48);
BI_MC(epel, h, 64);
BI_MC(epel, v, 12);
BI_MC(epel, v, 16);
@ -151,7 +158,14 @@ void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
intptr_t mx, \
intptr_t my, \
int width)
UNI_MC(qpel, h, 4);
UNI_MC(qpel, h, 6);
UNI_MC(qpel, h, 8);
UNI_MC(qpel, h, 12);
UNI_MC(qpel, h, 16);
UNI_MC(qpel, h, 24);
UNI_MC(qpel, h, 32);
UNI_MC(qpel, h, 48);
UNI_MC(qpel, h, 64);
UNI_MC(qpel, v, 24);
@ -287,6 +301,26 @@ PEL_UNI_W(epel, hv, 32);
PEL_UNI_W(epel, hv, 48);
PEL_UNI_W(epel, hv, 64);
PEL_UNI_W(epel, h, 4);
PEL_UNI_W(epel, h, 6);
PEL_UNI_W(epel, h, 8);
PEL_UNI_W(epel, h, 12);
PEL_UNI_W(epel, h, 16);
PEL_UNI_W(epel, h, 24);
PEL_UNI_W(epel, h, 32);
PEL_UNI_W(epel, h, 48);
PEL_UNI_W(epel, h, 64);
PEL_UNI_W(epel, v, 4);
PEL_UNI_W(epel, v, 6);
PEL_UNI_W(epel, v, 8);
PEL_UNI_W(epel, v, 12);
PEL_UNI_W(epel, v, 16);
PEL_UNI_W(epel, v, 24);
PEL_UNI_W(epel, v, 32);
PEL_UNI_W(epel, v, 48);
PEL_UNI_W(epel, v, 64);
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H

Loading…
Cancel
Save