avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 asm opt

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_epel_uni_w_hv4_8_c:       9.5     2.2
put_hevc_epel_uni_w_hv6_8_c:       18.5    5.0     3.7
put_hevc_epel_uni_w_hv8_8_c:       30.7    6.0     4.5
put_hevc_epel_uni_w_hv12_8_c:      63.7    14.0    10.7
put_hevc_epel_uni_w_hv16_8_c:      107.5   22.7    17.0
put_hevc_epel_uni_w_hv24_8_c:      236.7   50.2    31.7
put_hevc_epel_uni_w_hv32_8_c:      414.5   88.0    53.0
put_hevc_epel_uni_w_hv48_8_c:      917.5   197.7   118.5
put_hevc_epel_uni_w_hv64_8_c:      1617.0  349.5   203.0

After this patch, the performance of decoding H265 4K 30FPS 30Mbps
on 3A6000 with 8 threads improves by 3fps (52fps-->55fps).

Change-Id: If067e394cec4685c62193e7adb829ac93ba4804d
Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
release/7.0
jinbo 1 year ago committed by Michael Niedermayer
parent 6c6bf18ce8
commit 1f642b99af
No known key found for this signature in database
GPG Key ID: B18E8928B3948D64
  1. 821
      libavcodec/loongarch/hevc_mc.S
  2. 19
      libavcodec/loongarch/hevcdsp_init_loongarch.c
  3. 9
      libavcodec/loongarch/hevcdsp_lasx.h
  4. 10
      libavcodec/loongarch/hevcdsp_lsx.h

@ -22,6 +22,7 @@
#include "loongson_asm.S"
.extern ff_hevc_qpel_filters
.extern ff_hevc_epel_filters
.macro LOAD_VAR bit
addi.w t1, a5, 6 //shift
@ -206,6 +207,12 @@
.endif
.endm
/*
* void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
* const uint8_t *_src, ptrdiff_t _srcstride,
* int height, int denom, int wx, int ox,
* intptr_t mx, intptr_t my, int width)
*/
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
LOAD_VAR 128
srli.w t0, a4, 1
@ -482,6 +489,12 @@ endfunc
xvhaddw.d.w \in0, \in0, \in0
.endm
/*
* void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
* const uint8_t *_src, ptrdiff_t _srcstride,
* int height, int denom, int wx, int ox,
* intptr_t mx, intptr_t my, int width)
*/
function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx
LOAD_VAR 128
ld.d t0, sp, 8 //my
@ -1253,6 +1266,12 @@ endfunc
xvssrani.bu.h \out0, xr11, 0
.endm
/*
* void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
* const uint8_t *_src, ptrdiff_t _srcstride,
* int height, int denom, int wx, int ox,
* intptr_t mx, intptr_t my, int width)
*/
function ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx
LOAD_VAR 128
ld.d t0, sp, 0 //mx
@ -1763,3 +1782,805 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
addi.d a4, a4, -1
bnez a4, .LOOP_H64_LASX
endfunc
/*
 * Byte-shuffle control table for the epel_uni_w_hv kernels in this file.
 * Every group of four indices picks one window of four consecutive source
 * bytes, i.e. the input of one 4-tap horizontal filter.
 *   first 16 bytes : windows starting at source bytes 0, 1, 2, 3
 *   second 16 bytes: windows starting at source bytes 4, 5, 6, 7
 * The LSX functions load only the first 16 bytes (vld) and derive the second
 * row with "vaddi.bu vr22, vr0, 4"; the LASX functions load all 32 bytes
 * with xvld.
 */
const shufb
.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6
.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10
endconst
/*
 * PUT_HEVC_EPEL_UNI_W_HV4_LSX w
 *
 * LSX kernel body for one 4-pixel-wide column of put_hevc_epel_uni_w_hv:
 * 4-tap horizontal filter on every source row, 4-tap vertical filter over
 * four consecutive filtered rows, then weighting and clipping to 8 bit.
 * The macro argument w is only used to make the loop label unique for each
 * expansion.
 *
 * Registers as used by the code below (prepared by the expanding function):
 *   a0 = dst, advanced by a1 after every stored row
 *   a2 = src, advanced by a3 for every loaded row
 *   a4 = row counter; it is decremented after a row has been produced, so
 *        it must be non-zero on entry
 *   vr0        = shuffle control that builds the four 4-byte tap windows
 *   vr5        = horizontal filter taps
 *   vr16..vr19 = vertical filter taps, one tap per register
 *   vr1..vr4   = wx, rounding offset, shift, ox (see the inline comments)
 *   vr7..vr14  = scratch
 */
.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
// Prime the vertical filter: load and horizontally filter the first three rows.
fld.d f7, a2, 0 // start to load src
fldx.d f8, a2, a3
alsl.d a2, a3, a2, 1
fld.d f9, a2, 0
vshuf.b vr7, vr7, vr7, vr0 // 0123 1234 2345 3456
vshuf.b vr8, vr8, vr8, vr0
vshuf.b vr9, vr9, vr9, vr0
vdp2.h.bu.b vr10, vr7, vr5 // EPEL_FILTER(src, 1)
vdp2.h.bu.b vr11, vr8, vr5
vdp2.h.bu.b vr12, vr9, vr5
vhaddw.w.h vr10, vr10, vr10 // tmp[0/1/2/3]
vhaddw.w.h vr11, vr11, vr11 // vr10,vr11,vr12 corresponding to EPEL_EXTRA
vhaddw.w.h vr12, vr12, vr12
.LOOP_HV4_\w:
// One output row per iteration: filter the next source row into vr13 ...
add.d a2, a2, a3
fld.d f14, a2, 0 // height loop begin
vshuf.b vr14, vr14, vr14, vr0
vdp2.h.bu.b vr13, vr14, vr5
vhaddw.w.h vr13, vr13, vr13
// ... and combine it with the three previous filtered rows.
vmul.w vr14, vr10, vr16 // EPEL_FILTER(tmp, MAX_PB_SIZE)
vmadd.w vr14, vr11, vr17
vmadd.w vr14, vr12, vr18
vmadd.w vr14, vr13, vr19
// Slide the three-row history down by one row for the next iteration.
vaddi.wu vr10, vr11, 0 //back up previous value
vaddi.wu vr11, vr12, 0
vaddi.wu vr12, vr13, 0
vsrai.w vr14, vr14, 6 // >> 6
vmul.w vr14, vr14, vr1 // * wx
vadd.w vr14, vr14, vr2 // + offset
vsra.w vr14, vr14, vr3 // >> shift
vadd.w vr14, vr14, vr4 // + ox
// Saturating narrow 32 -> 16 -> unsigned 8 bit; the four pixels end up in the low word.
vssrani.h.w vr14, vr14, 0
vssrani.bu.h vr14, vr14, 0 // clip
fst.s f14, a0, 0
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .LOOP_HV4_\w
.endm
/*
 * void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
 * const uint8_t *_src, ptrdiff_t _srcstride,
 * int height, int denom, int wx, int ox,
 * intptr_t mx, intptr_t my, int width)
 *
 * 4-pixel-wide LSX variant.  As used below: a0 = dst, a1 = dststride,
 * a2 = src, a3 = srcstride, a4 = height; mx and my are read from the stack
 * at sp + 0 and sp + 8.  denom/wx/ox are consumed by LOAD_VAR, which is
 * defined earlier in this file (presumably it fills vr1..vr4 with wx,
 * offset, shift and ox, as the kernel comments suggest).
 */
function ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx
LOAD_VAR 128
// Horizontal taps: 4 bytes at ff_hevc_epel_filters + (mx - 1) * 4,
// broadcast to all four words of vr5.
ld.d t0, sp, 0 // mx
addi.d t0, t0, -1
slli.w t0, t0, 2
la.local t1, ff_hevc_epel_filters
vldx vr5, t1, t0 // ff_hevc_epel_filters[mx - 1];
vreplvei.w vr5, vr5, 0
// Vertical taps: widened from 8 to 32 bit, then each tap is broadcast
// into its own register (vr16..vr19).
ld.d t0, sp, 8 // my
addi.d t0, t0, -1
slli.w t0, t0, 2
vldx vr6, t1, t0 // ff_hevc_epel_filters[my - 1];
vsllwil.h.b vr6, vr6, 0
vsllwil.w.h vr6, vr6, 0
vreplvei.w vr16, vr6, 0
vreplvei.w vr17, vr6, 1
vreplvei.w vr18, vr6, 2
vreplvei.w vr19, vr6, 3
// Shuffle control for the four tap windows (first row of shufb).
la.local t1, shufb
vld vr0, t1, 0
// Start one row above and one pixel left of the block: the 4-tap filters
// need one sample before the current position.
sub.d a2, a2, a3 // src -= srcstride
addi.d a2, a2, -1
PUT_HEVC_EPEL_UNI_W_HV4_LSX 4
endfunc
/*
 * PUT_HEVC_EPEL_UNI_W_HV8_LSX w
 *
 * LSX kernel body for one 8-pixel-wide column of put_hevc_epel_uni_w_hv.
 * The eight columns are processed as two groups of four: columns 0-3 use
 * the shuffle control in vr0, columns 4-7 the one in vr22.
 *
 * The macro argument w makes the loop label unique and selects the store
 * width: w > 6 stores 8 bytes per row, otherwise 6 bytes (4 + 2).
 *
 * Registers as used by the code below (prepared by the expanding function):
 *   a0 = dst, advanced by a1 after every stored row
 *   a2 = src, advanced by a3 for every loaded row
 *   a4 = row counter; decremented after a row has been produced, so it
 *        must be non-zero on entry
 *   vr0 / vr22 = shuffle controls for columns 0-3 / 4-7
 *   vr5        = horizontal filter taps
 *   vr16..vr19 = vertical filter taps, one tap per register
 *   vr1..vr4   = wx, rounding offset, shift, ox (see the inline comments)
 *   vr7..vr15, vr20, vr21, vr23 = scratch
 * Every row is fetched with a full 16-byte vld.
 */
.macro PUT_HEVC_EPEL_UNI_W_HV8_LSX w
// Prime the vertical filter with the first three source rows.
vld vr7, a2, 0 // start to load src
vldx vr8, a2, a3
alsl.d a2, a3, a2, 1
vld vr9, a2, 0
vshuf.b vr10, vr7, vr7, vr0 // 0123 1234 2345 3456
vshuf.b vr11, vr8, vr8, vr0
vshuf.b vr12, vr9, vr9, vr0
vshuf.b vr7, vr7, vr7, vr22// 4567 5678 6789 78910
vshuf.b vr8, vr8, vr8, vr22
vshuf.b vr9, vr9, vr9, vr22
vdp2.h.bu.b vr13, vr10, vr5 // EPEL_FILTER(src, 1)
vdp2.h.bu.b vr14, vr11, vr5
vdp2.h.bu.b vr15, vr12, vr5
vdp2.h.bu.b vr23, vr7, vr5
vdp2.h.bu.b vr20, vr8, vr5
vdp2.h.bu.b vr21, vr9, vr5
// After these sums: vr7..vr9 = filtered rows 0..2 for columns 0-3,
// vr10..vr12 = filtered rows 0..2 for columns 4-7.
vhaddw.w.h vr7, vr13, vr13
vhaddw.w.h vr8, vr14, vr14
vhaddw.w.h vr9, vr15, vr15
vhaddw.w.h vr10, vr23, vr23
vhaddw.w.h vr11, vr20, vr20
vhaddw.w.h vr12, vr21, vr21
.LOOP_HV8_HORI_\w:
// One output row per iteration: horizontally filter the next source row.
add.d a2, a2, a3
vld vr15, a2, 0
vshuf.b vr23, vr15, vr15, vr0
vshuf.b vr15, vr15, vr15, vr22
vdp2.h.bu.b vr13, vr23, vr5
vdp2.h.bu.b vr14, vr15, vr5
vhaddw.w.h vr13, vr13, vr13 // newest filtered row, columns 0-3
vhaddw.w.h vr14, vr14, vr14 // newest filtered row, columns 4-7
vmul.w vr15, vr7, vr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
vmadd.w vr15, vr8, vr17
vmadd.w vr15, vr9, vr18
vmadd.w vr15, vr13, vr19
vmul.w vr20, vr10, vr16
vmadd.w vr20, vr11, vr17
vmadd.w vr20, vr12, vr18
vmadd.w vr20, vr14, vr19
// Slide both three-row histories down by one row.
vaddi.wu vr7, vr8, 0 //back up previous value
vaddi.wu vr8, vr9, 0
vaddi.wu vr9, vr13, 0
vaddi.wu vr10, vr11, 0
vaddi.wu vr11, vr12, 0
vaddi.wu vr12, vr14, 0
vsrai.w vr15, vr15, 6 // >> 6
vsrai.w vr20, vr20, 6
vmul.w vr15, vr15, vr1 // * wx
vmul.w vr20, vr20, vr1
vadd.w vr15, vr15, vr2 // + offset
vadd.w vr20, vr20, vr2
vsra.w vr15, vr15, vr3 // >> shift
vsra.w vr20, vr20, vr3
vadd.w vr15, vr15, vr4 // + ox
vadd.w vr20, vr20, vr4
// Saturating narrow: columns 0-3 (vr15) and 4-7 (vr20) are packed into
// vr20 as 16 bit, then clipped to unsigned 8 bit in the low 8 bytes.
vssrani.h.w vr20, vr15, 0
vssrani.bu.h vr20, vr20, 0
.if \w > 6
fst.d f20, a0, 0
.else
// 6-pixel case: 4 bytes plus halfword element 2 at byte offset 4.
fst.s f20, a0, 0
vstelm.h vr20, a0, 4, 2
.endif
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .LOOP_HV8_HORI_\w
.endm
/*
 * PUT_HEVC_EPEL_UNI_W_HV8_LASX w
 *
 * LASX kernel body for one 8-pixel-wide column of put_hevc_epel_uni_w_hv.
 * Each 16-byte source row is duplicated into both 128-bit lanes so that a
 * single xvshuf.b with the 32-byte shufb table (xr0) yields the tap
 * windows of columns 0-3 in the low lane and of columns 4-7 in the high
 * lane.
 *
 * The macro argument w makes the loop label unique and selects the store
 * width: w > 6 stores 8 bytes per row, otherwise 6 bytes (4 + 2).
 *
 * Registers as used by the code below (prepared by the expanding function):
 *   a0 = dst, advanced by a1 after every stored row
 *   a2 = src, advanced by a3 for every loaded row
 *   a4 = row counter; decremented after a row has been produced, so it
 *        must be non-zero on entry
 *   xr0        = shuffle control (both rows of shufb)
 *   xr5        = horizontal filter taps
 *   xr16..xr19 = vertical filter taps, one tap per register
 *   xr1..xr4   = wx, rounding offset, shift, ox (see the inline comments)
 *   xr7..xr15, xr20, xr23 = scratch
 */
.macro PUT_HEVC_EPEL_UNI_W_HV8_LASX w
// Prime the vertical filter with the first three source rows.
vld vr7, a2, 0 // start to load src
vldx vr8, a2, a3
alsl.d a2, a3, a2, 1
vld vr9, a2, 0
// Copy the loaded 16 bytes into both 128-bit lanes.
xvreplve0.q xr7, xr7
xvreplve0.q xr8, xr8
xvreplve0.q xr9, xr9
xvshuf.b xr10, xr7, xr7, xr0 // 0123 1234 2345 3456
xvshuf.b xr11, xr8, xr8, xr0
xvshuf.b xr12, xr9, xr9, xr0
xvdp2.h.bu.b xr13, xr10, xr5 // EPEL_FILTER(src, 1)
xvdp2.h.bu.b xr14, xr11, xr5
xvdp2.h.bu.b xr15, xr12, xr5
// xr7..xr9 = filtered rows 0..2, eight 32-bit columns each.
xvhaddw.w.h xr7, xr13, xr13
xvhaddw.w.h xr8, xr14, xr14
xvhaddw.w.h xr9, xr15, xr15
.LOOP_HV8_HORI_LASX_\w:
// One output row per iteration: horizontally filter the next source row into xr10.
add.d a2, a2, a3
vld vr15, a2, 0
xvreplve0.q xr15, xr15
xvshuf.b xr23, xr15, xr15, xr0
xvdp2.h.bu.b xr10, xr23, xr5
xvhaddw.w.h xr10, xr10, xr10
xvmul.w xr15, xr7, xr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
xvmadd.w xr15, xr8, xr17
xvmadd.w xr15, xr9, xr18
xvmadd.w xr15, xr10, xr19
// Slide the three-row history down by one row.
xvaddi.wu xr7, xr8, 0 //back up previous value
xvaddi.wu xr8, xr9, 0
xvaddi.wu xr9, xr10, 0
xvsrai.w xr15, xr15, 6 // >> 6
xvmul.w xr15, xr15, xr1 // * wx
xvadd.w xr15, xr15, xr2 // + offset
xvsra.w xr15, xr15, xr3 // >> shift
xvadd.w xr15, xr15, xr4 // + ox
// Bring the upper lane (columns 4-7) down into vr20 so that the 128-bit
// narrowing instructions can pack all eight columns together.
xvpermi.q xr20, xr15, 0x01
vssrani.h.w vr20, vr15, 0
vssrani.bu.h vr20, vr20, 0
.if \w > 6
fst.d f20, a0, 0
.else
// 6-pixel case: 4 bytes plus halfword element 2 at byte offset 4.
fst.s f20, a0, 0
vstelm.h vr20, a0, 4, 2
.endif
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .LOOP_HV8_HORI_LASX_\w
.endm
/*
 * PUT_HEVC_EPEL_UNI_W_HV16_LASX w
 *
 * LASX kernel body for one 16-pixel-wide column of put_hevc_epel_uni_w_hv.
 * Each 32-byte source row is split into two overlapping 16-byte pieces
 * (bytes 0.. and bytes 8..); every piece is duplicated into both lanes and
 * shuffled with xr0, giving columns 0-7 in one register and columns 8-15
 * in another.
 *
 * The macro argument w makes the loop label unique and selects the store
 * width: w < 16 stores 12 bytes per row (8 + 4), otherwise 16 bytes.
 *
 * Registers as used by the code below (prepared by the expanding function):
 *   a0 = dst, advanced by a1 after every stored row
 *   a2 = src, advanced by a3 for every loaded row
 *   a4 = row counter; decremented after a row has been produced, so it
 *        must be non-zero on entry
 *   xr0        = shuffle control (both rows of shufb)
 *   xr5        = horizontal filter taps
 *   xr16..xr19 = vertical filter taps, one tap per register
 *   xr1..xr4   = wx, rounding offset, shift, ox (see the inline comments)
 *   xr7..xr15, xr20..xr22 = scratch
 * Every row is fetched with a full 32-byte xvld.
 */
.macro PUT_HEVC_EPEL_UNI_W_HV16_LASX w
// Prime the vertical filter with the first three source rows.
xvld xr7, a2, 0 // start to load src
xvldx xr8, a2, a3
alsl.d a2, a3, a2, 1
xvld xr9, a2, 0
// Low lane of xr10..xr12 = source bytes 8..23 of each row.
xvpermi.d xr10, xr7, 0x09 //8..18
xvpermi.d xr11, xr8, 0x09
xvpermi.d xr12, xr9, 0x09
// Columns 0-7: duplicate bytes 0..15 into both lanes, shuffle, filter.
xvreplve0.q xr7, xr7
xvreplve0.q xr8, xr8
xvreplve0.q xr9, xr9
xvshuf.b xr13, xr7, xr7, xr0 // 0123 1234 2345 3456
xvshuf.b xr14, xr8, xr8, xr0
xvshuf.b xr15, xr9, xr9, xr0
xvdp2.h.bu.b xr20, xr13, xr5 // EPEL_FILTER(src, 1)
xvdp2.h.bu.b xr21, xr14, xr5
xvdp2.h.bu.b xr22, xr15, xr5
xvhaddw.w.h xr7, xr20, xr20
xvhaddw.w.h xr8, xr21, xr21
xvhaddw.w.h xr9, xr22, xr22
// Columns 8-15: same steps on the shifted copies.
xvreplve0.q xr10, xr10
xvreplve0.q xr11, xr11
xvreplve0.q xr12, xr12
xvshuf.b xr13, xr10, xr10, xr0
xvshuf.b xr14, xr11, xr11, xr0
xvshuf.b xr15, xr12, xr12, xr0
xvdp2.h.bu.b xr20, xr13, xr5
xvdp2.h.bu.b xr21, xr14, xr5
xvdp2.h.bu.b xr22, xr15, xr5
xvhaddw.w.h xr10, xr20, xr20
xvhaddw.w.h xr11, xr21, xr21
xvhaddw.w.h xr12, xr22, xr22
.LOOP_HV16_HORI_LASX_\w:
// One output row per iteration: horizontally filter the next source row
// into xr13 (columns 0-7) and xr14 (columns 8-15).
add.d a2, a2, a3
xvld xr15, a2, 0
xvpermi.d xr20, xr15, 0x09 //8...18
xvreplve0.q xr15, xr15
xvreplve0.q xr20, xr20
xvshuf.b xr21, xr15, xr15, xr0
xvshuf.b xr22, xr20, xr20, xr0
xvdp2.h.bu.b xr13, xr21, xr5
xvdp2.h.bu.b xr14, xr22, xr5
xvhaddw.w.h xr13, xr13, xr13
xvhaddw.w.h xr14, xr14, xr14
xvmul.w xr15, xr7, xr16 //EPEL_FILTER(tmp, MAX_PB_SIZE)
xvmadd.w xr15, xr8, xr17
xvmadd.w xr15, xr9, xr18
xvmadd.w xr15, xr13, xr19
xvmul.w xr20, xr10, xr16
xvmadd.w xr20, xr11, xr17
xvmadd.w xr20, xr12, xr18
xvmadd.w xr20, xr14, xr19
// Slide both three-row histories down by one row.
xvaddi.wu xr7, xr8, 0 //back up previous value
xvaddi.wu xr8, xr9, 0
xvaddi.wu xr9, xr13, 0
xvaddi.wu xr10, xr11, 0
xvaddi.wu xr11, xr12, 0
xvaddi.wu xr12, xr14, 0
xvsrai.w xr15, xr15, 6 // >> 6
xvsrai.w xr20, xr20, 6 // >> 6
xvmul.w xr15, xr15, xr1 // * wx
xvmul.w xr20, xr20, xr1 // * wx
xvadd.w xr15, xr15, xr2 // + offset
xvadd.w xr20, xr20, xr2 // + offset
xvsra.w xr15, xr15, xr3 // >> shift
xvsra.w xr20, xr20, xr3 // >> shift
xvadd.w xr15, xr15, xr4 // + ox
xvadd.w xr20, xr20, xr4 // + ox
// Saturating narrow to 16 bit (per lane), fold the upper lane down,
// saturating narrow to unsigned 8 bit, then reorder the four words so
// that the 16 pixels are stored in ascending column order.
xvssrani.h.w xr20, xr15, 0
xvpermi.q xr21, xr20, 0x01
vssrani.bu.h vr21, vr20, 0
vpermi.w vr21, vr21, 0xd8
.if \w < 16
// 12-pixel case: 8 bytes plus word element 2 at byte offset 8.
fst.d f21, a0, 0
vstelm.w vr21, a0, 8, 2
.else
vst vr21, a0, 0
.endif
add.d a0, a0, a1
addi.d a4, a4, -1
bnez a4, .LOOP_HV16_HORI_LASX_\w
.endm
/*
 * put_hevc_epel_uni_w_hv, widths 6..64, LSX and LASX.
 *
 * void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
 *                                   const uint8_t *_src, ptrdiff_t _srcstride,
 *                                   int height, int denom, int wx, int ox,
 *                                   intptr_t mx, intptr_t my, int width)
 *
 * As used below: a0 = dst, a1 = dststride, a2 = src, a3 = srcstride,
 * a4 = height; mx and my are read from the stack at sp + 0 and sp + 8.
 * denom/wx/ox are consumed by LOAD_VAR (defined earlier in this file).
 *
 * All functions share the same setup and, for the wide blocks, the same
 * column loop, so both are emitted through the private macros below.  The
 * macros expand to the instruction sequences that used to be written out
 * in every function.
 */

/*
 * Setup shared by all LSX functions in this group.
 * On exit:
 *   vr5        = horizontal taps ff_hevc_epel_filters[mx - 1], broadcast
 *   vr16..vr19 = vertical taps ff_hevc_epel_filters[my - 1], widened to
 *                32 bit, one tap broadcast per register
 *   vr0 / vr22 = shuffle controls for columns 0-3 / 4-7
 *   a2         = src - srcstride - 1
 * Clobbers t0, t1, vr6 in addition to whatever LOAD_VAR clobbers.
 * NOTE(review): vldx reads 16 bytes although only the first 4 are used;
 * confirm that reading past the last table row is acceptable here.
 */
.macro HEVC_EPEL_UNI_W_HV_SETUP_LSX
    LOAD_VAR 128
    ld.d            t0,     sp,     0       // mx
    addi.d          t0,     t0,     -1
    slli.w          t0,     t0,     2
    la.local        t1,     ff_hevc_epel_filters
    vldx            vr5,    t1,     t0      // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5,    vr5,    0
    ld.d            t0,     sp,     8       // my
    addi.d          t0,     t0,     -1
    slli.w          t0,     t0,     2
    vldx            vr6,    t1,     t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,    vr6,    0       // taps: 8 -> 16 bit
    vsllwil.w.h     vr6,    vr6,    0       // taps: 16 -> 32 bit
    vreplvei.w      vr16,   vr6,    0
    vreplvei.w      vr17,   vr6,    1
    vreplvei.w      vr18,   vr6,    2
    vreplvei.w      vr19,   vr6,    3
    la.local        t1,     shufb
    vld             vr0,    t1,     0
    vaddi.bu        vr22,   vr0,    4       // update shufb to get high part
    sub.d           a2,     a2,     a3      // src -= srcstride
    addi.d          a2,     a2,     -1      // src -= 1
.endm

/*
 * Setup shared by all LASX functions in this group.
 * On exit:
 *   xr5        = horizontal taps ff_hevc_epel_filters[mx - 1], broadcast
 *   xr16..xr19 = vertical taps ff_hevc_epel_filters[my - 1], widened to
 *                32 bit, one tap broadcast per register
 *   xr0        = both rows of shufb
 *   a2         = src - srcstride - 1
 * Clobbers t0, t1, xr6 in addition to whatever LOAD_VAR clobbers.
 * NOTE(review): vldx reads 16 bytes although only the first 4 are used;
 * confirm that reading past the last table row is acceptable here.
 */
.macro HEVC_EPEL_UNI_W_HV_SETUP_LASX
    LOAD_VAR 256
    ld.d            t0,     sp,     0       // mx
    addi.d          t0,     t0,     -1
    slli.w          t0,     t0,     2
    la.local        t1,     ff_hevc_epel_filters
    vldx            vr5,    t1,     t0      // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5,    xr5
    ld.d            t0,     sp,     8       // my
    addi.d          t0,     t0,     -1
    slli.w          t0,     t0,     2
    vldx            vr6,    t1,     t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,    vr6,    0       // taps: 8 -> 16 bit
    vsllwil.w.h     vr6,    vr6,    0       // taps: 16 -> 32 bit
    xvreplve0.q     xr6,    xr6
    xvrepl128vei.w  xr16,   xr6,    0
    xvrepl128vei.w  xr17,   xr6,    1
    xvrepl128vei.w  xr18,   xr6,    2
    xvrepl128vei.w  xr19,   xr6,    3
    la.local        t1,     shufb
    xvld            xr0,    t1,     0
    sub.d           a2,     a2,     a3      // src -= srcstride
    addi.d          a2,     a2,     -1      // src -= 1
.endm

/*
 * Process a block of width w as ncols adjacent 8-pixel columns (LSX).
 * t2/t3 hold the dst/src base of the current column and are advanced by 8
 * after every column; t4 keeps the height so a4 can be reloaded, because
 * the kernel counts a4 down to zero.  t5 is the column counter.
 */
.macro HEVC_EPEL_UNI_W_HV_COLUMNS_LSX w, ncols
    addi.d          t2,     a0,     0
    addi.d          t3,     a2,     0
    addi.d          t4,     a4,     0
    addi.d          t5,     zero,   \ncols
.LOOP_HV\w:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX \w
    addi.d          a0,     t2,     8
    addi.d          t2,     t2,     8
    addi.d          a2,     t3,     8
    addi.d          t3,     t3,     8
    addi.d          a4,     t4,     0
    addi.d          t5,     t5,     -1
    bnez            t5,     .LOOP_HV\w
.endm

/*
 * Process a block of width w as ncols adjacent 16-pixel columns (LASX).
 * Same register roles as the LSX column loop, with a column step of 16.
 */
.macro HEVC_EPEL_UNI_W_HV_COLUMNS_LASX w, ncols
    addi.d          t2,     a0,     0
    addi.d          t3,     a2,     0
    addi.d          t4,     a4,     0
    addi.d          t5,     zero,   \ncols
.LOOP_HV\w\()_LASX:
    PUT_HEVC_EPEL_UNI_W_HV16_LASX \w
    addi.d          a0,     t2,     16
    addi.d          t2,     t2,     16
    addi.d          a2,     t3,     16
    addi.d          t3,     t3,     16
    addi.d          a4,     t4,     0
    addi.d          t5,     t5,     -1
    bnez            t5,     .LOOP_HV\w\()_LASX
.endm

function ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 6
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 6
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 8
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 8
endfunc

// Width 12 (LSX): one 8-wide column followed by one 4-wide column at +8.
function ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    addi.d          t2,     a0,     0       // keep dst, src and height:
    addi.d          t3,     a2,     0       // the kernel modifies a0/a2/a4
    addi.d          t4,     a4,     0
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 12
    addi.d          a0,     t2,     8
    addi.d          a2,     t3,     8
    addi.d          a4,     t4,     0
    PUT_HEVC_EPEL_UNI_W_HV4_LSX 12
endfunc

// Width 12 (LASX): the 16-wide kernel stores only 12 bytes per row for w < 16.
function ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 12
endfunc

// Previously this loop did not advance t2/t3 and was only correct for
// exactly two columns; it now uses the same column loop as the wider blocks.
function ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LSX 16, 2
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 16
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LSX 24, 3
endfunc

// Width 24 (LASX): one 16-wide column followed by one 8-wide column at +16.
function ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    addi.d          t2,     a0,     0       // keep dst, src and height:
    addi.d          t3,     a2,     0       // the kernel modifies a0/a2/a4
    addi.d          t4,     a4,     0
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 24
    addi.d          a0,     t2,     16
    addi.d          a2,     t3,     16
    addi.d          a4,     t4,     0
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 24
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LSX 32, 4
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LASX 32, 2
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LSX 48, 6
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LASX 48, 3
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx
    HEVC_EPEL_UNI_W_HV_SETUP_LSX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LSX 64, 8
endfunc

function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx
    HEVC_EPEL_UNI_W_HV_SETUP_LASX
    HEVC_EPEL_UNI_W_HV_COLUMNS_LASX 64, 4
endfunc

@ -171,6 +171,16 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx;
c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx;
c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx;
c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx;
c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx;
c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx;
c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx;
c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx;
c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx;
c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
@ -258,6 +268,15 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx;
c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx;
c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx;
c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx;
c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx;
c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx;
c->put_hevc_epel_uni_w[8][1][1] = ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx;
c->put_hevc_epel_uni_w[9][1][1] = ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx;
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx;

@ -66,6 +66,15 @@ PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
PEL_UNI_W(epel, hv, 6);
PEL_UNI_W(epel, hv, 8);
PEL_UNI_W(epel, hv, 12);
PEL_UNI_W(epel, hv, 16);
PEL_UNI_W(epel, hv, 24);
PEL_UNI_W(epel, hv, 32);
PEL_UNI_W(epel, hv, 48);
PEL_UNI_W(epel, hv, 64);
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H

@ -277,6 +277,16 @@ PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
PEL_UNI_W(epel, hv, 4);
PEL_UNI_W(epel, hv, 6);
PEL_UNI_W(epel, hv, 8);
PEL_UNI_W(epel, hv, 12);
PEL_UNI_W(epel, hv, 16);
PEL_UNI_W(epel, hv, 24);
PEL_UNI_W(epel, hv, 32);
PEL_UNI_W(epel, hv, 48);
PEL_UNI_W(epel, hv, 64);
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H

Loading…
Cancel
Save