/*
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"
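
/*
 * The LASX vector registers overlay the FP register file, so clobbering
 * xr24-xr31 below also clobbers f24-f31 (fs0-fs7), which are callee-saved
 * under the LoongArch64 ABI. fr_store/fr_recover spill and reload their low
 * 64 bits around each routine.
 */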

.macro fr_store
    addi.d sp, sp, -64
    fst.d f24, sp, 0
    fst.d f25, sp, 8
    fst.d f26, sp, 16
    fst.d f27, sp, 24
    fst.d f28, sp, 32
    fst.d f29, sp, 40
    fst.d f30, sp, 48
    fst.d f31, sp, 56
.endm

.macro fr_recover
    fld.d f24, sp, 0
    fld.d f25, sp, 8
    fld.d f26, sp, 16
    fld.d f27, sp, 24
    fld.d f28, sp, 32
    fld.d f29, sp, 40
    fld.d f30, sp, 48
    fld.d f31, sp, 56
    addi.d sp, sp, 64
.endm
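
/*
 * HEVC transform coefficient tables used by the multiply-accumulate
 * sequences below. They are only declared here and are expected to be
 * defined in the companion LoongArch HEVC IDCT source.
 */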

.extern gt32x32_cnst1

.extern gt32x32_cnst2

.extern gt8x8_cnst

.extern gt32x32_cnst0
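
/*
 * idct_16x32_step1_lasx: multiply the interleaved coefficients in xr8-xr15
 * by the four constant pairs at t1 (packed 16-bit dot products built from
 * xvmulwev.w.h/xvmaddwod.w.h), then butterfly the result against the partial
 * sums already held in the scratch buffer: the sum is written back at t2,
 * the difference at t3.
 */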

.macro idct_16x32_step1_lasx
    xvldrepl.w xr20, t1, 0
    xvldrepl.w xr21, t1, 4
    xvldrepl.w xr22, t1, 8
    xvldrepl.w xr23, t1, 12

    xvmulwev.w.h xr16, xr8, xr20
    xvmaddwod.w.h xr16, xr8, xr20
    xvmulwev.w.h xr17, xr9, xr20
    xvmaddwod.w.h xr17, xr9, xr20

    xvmaddwev.w.h xr16, xr10, xr21
    xvmaddwod.w.h xr16, xr10, xr21
    xvmaddwev.w.h xr17, xr11, xr21
    xvmaddwod.w.h xr17, xr11, xr21

    xvmaddwev.w.h xr16, xr12, xr22
    xvmaddwod.w.h xr16, xr12, xr22
    xvmaddwev.w.h xr17, xr13, xr22
    xvmaddwod.w.h xr17, xr13, xr22

    xvmaddwev.w.h xr16, xr14, xr23
    xvmaddwod.w.h xr16, xr14, xr23
    xvmaddwev.w.h xr17, xr15, xr23
    xvmaddwod.w.h xr17, xr15, xr23

    xvld xr0, t2, 0
    xvld xr1, t2, 32

    xvadd.w xr18, xr0, xr16
    xvadd.w xr19, xr1, xr17
    xvsub.w xr0, xr0, xr16
    xvsub.w xr1, xr1, xr17

    xvst xr18, t2, 0
    xvst xr19, t2, 32
    xvst xr0, t3, 0
    xvst xr1, t3, 32
.endm
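
/*
 * idct_16x32_step2_lasx: same packed dot-product pattern as step1, but fully
 * register-parameterised: the eight interleaved inputs are accumulated
 * against the four constant pairs at t1 into \out0 (low interleave) and
 * \out1 (high interleave) without touching memory.
 */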

.macro idct_16x32_step2_lasx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1

    xvldrepl.w xr20, t1, 0
    xvldrepl.w xr21, t1, 4
    xvldrepl.w xr22, t1, 8
    xvldrepl.w xr23, t1, 12

    xvmulwev.w.h \out0, \in0, xr20
    xvmaddwod.w.h \out0, \in0, xr20
    xvmulwev.w.h \out1, \in1, xr20
    xvmaddwod.w.h \out1, \in1, xr20
    xvmaddwev.w.h \out0, \in2, xr21
    xvmaddwod.w.h \out0, \in2, xr21
    xvmaddwev.w.h \out1, \in3, xr21
    xvmaddwod.w.h \out1, \in3, xr21
    xvmaddwev.w.h \out0, \in4, xr22
    xvmaddwod.w.h \out0, \in4, xr22
    xvmaddwev.w.h \out1, \in5, xr22
    xvmaddwod.w.h \out1, \in5, xr22
    xvmaddwev.w.h \out0, \in6, xr23
    xvmaddwod.w.h \out0, \in6, xr23
    xvmaddwev.w.h \out1, \in7, xr23   // sum0_r
    xvmaddwod.w.h \out1, \in7, xr23   // sum0_l
.endm
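
/*
 * idct_16x32_step3_lasx: combine the two partial odd-part accumulators
 * produced by the step2 calls (xr16/xr17 and xr30/xr31), butterfly them
 * against the even part stored at t2, narrow with rounding and saturation by
 * \round, and store one output row at t4 and the mirrored row at t5.
 */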

/* loop for all columns of filter constants */
.macro idct_16x32_step3_lasx round
    xvadd.w xr16, xr16, xr30
    xvadd.w xr17, xr17, xr31

    xvld xr0, t2, 0
    xvld xr1, t2, 32

    xvadd.w xr30, xr0, xr16
    xvadd.w xr31, xr1, xr17
    xvsub.w xr16, xr0, xr16
    xvsub.w xr17, xr1, xr17
    xvssrarni.h.w xr31, xr30, \round
    xvssrarni.h.w xr17, xr16, \round
    xvst xr31, t4, 0
    xvst xr17, t5, 0
.endm
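
/*
 * idct_16x32_lasx: 32-point inverse transform applied down 16 columns of
 * coefficients. \buf_pitch is the row pitch of the coefficient buffer (in
 * int16 units) and \round the final shift. It is instantiated below with
 * (32, 7) for the first pass and (16, 12) for the second pass, which matches
 * the HEVC inverse-transform shifts for 8-bit content. The even half of the
 * 32-point transform is accumulated into the scratch buffer at sp + 64; the
 * odd half is then folded in one output row pair at a time via step2/step3.
 */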

.macro idct_16x32_lasx buf_pitch, round
    addi.d t2, sp, 64

    addi.d t0, a0, \buf_pitch*4*2

    // 4 12 20 28
    xvld xr0, t0, 0
    xvld xr1, t0, \buf_pitch*8*2
    xvld xr2, t0, \buf_pitch*16*2
    xvld xr3, t0, \buf_pitch*24*2

    xvilvl.h xr10, xr1, xr0
    xvilvh.h xr11, xr1, xr0
    xvilvl.h xr12, xr3, xr2
    xvilvh.h xr13, xr3, xr2

    la.local t1, gt32x32_cnst2

    xvldrepl.w xr20, t1, 0
    xvldrepl.w xr21, t1, 4
    xvmulwev.w.h xr14, xr10, xr20
    xvmaddwod.w.h xr14, xr10, xr20
    xvmulwev.w.h xr15, xr11, xr20
    xvmaddwod.w.h xr15, xr11, xr20
    xvmaddwev.w.h xr14, xr12, xr21
    xvmaddwod.w.h xr14, xr12, xr21
    xvmaddwev.w.h xr15, xr13, xr21
    xvmaddwod.w.h xr15, xr13, xr21

    xvldrepl.w xr20, t1, 8
    xvldrepl.w xr21, t1, 12
    xvmulwev.w.h xr16, xr10, xr20
    xvmaddwod.w.h xr16, xr10, xr20
    xvmulwev.w.h xr17, xr11, xr20
    xvmaddwod.w.h xr17, xr11, xr20
    xvmaddwev.w.h xr16, xr12, xr21
    xvmaddwod.w.h xr16, xr12, xr21
    xvmaddwev.w.h xr17, xr13, xr21
    xvmaddwod.w.h xr17, xr13, xr21

    xvldrepl.w xr20, t1, 16
    xvldrepl.w xr21, t1, 20
    xvmulwev.w.h xr18, xr10, xr20
    xvmaddwod.w.h xr18, xr10, xr20
    xvmulwev.w.h xr19, xr11, xr20
    xvmaddwod.w.h xr19, xr11, xr20
    xvmaddwev.w.h xr18, xr12, xr21
    xvmaddwod.w.h xr18, xr12, xr21
    xvmaddwev.w.h xr19, xr13, xr21
    xvmaddwod.w.h xr19, xr13, xr21

    xvldrepl.w xr20, t1, 24
    xvldrepl.w xr21, t1, 28
    xvmulwev.w.h xr22, xr10, xr20
    xvmaddwod.w.h xr22, xr10, xr20
    xvmulwev.w.h xr23, xr11, xr20
    xvmaddwod.w.h xr23, xr11, xr20
    xvmaddwev.w.h xr22, xr12, xr21
    xvmaddwod.w.h xr22, xr12, xr21
    xvmaddwev.w.h xr23, xr13, xr21
    xvmaddwod.w.h xr23, xr13, xr21

    /* process coeff 0, 8, 16, 24 */
    la.local t1, gt8x8_cnst

    xvld xr0, a0, 0
    xvld xr1, a0, \buf_pitch*8*2
    xvld xr2, a0, \buf_pitch*16*2
    xvld xr3, a0, \buf_pitch*24*2

    xvldrepl.w xr20, t1, 0
    xvldrepl.w xr21, t1, 4

    xvilvl.h xr10, xr2, xr0
    xvilvh.h xr11, xr2, xr0
    xvilvl.h xr12, xr3, xr1
    xvilvh.h xr13, xr3, xr1

    xvmulwev.w.h xr4, xr10, xr20
    xvmaddwod.w.h xr4, xr10, xr20     // sum0_r
    xvmulwev.w.h xr5, xr11, xr20
    xvmaddwod.w.h xr5, xr11, xr20     // sum0_l
    xvmulwev.w.h xr6, xr12, xr21
    xvmaddwod.w.h xr6, xr12, xr21     // tmp1_r
    xvmulwev.w.h xr7, xr13, xr21
    xvmaddwod.w.h xr7, xr13, xr21     // tmp1_l

    xvsub.w xr0, xr4, xr6             // sum1_r
    xvadd.w xr1, xr4, xr6             // sum0_r
    xvsub.w xr2, xr5, xr7             // sum1_l
    xvadd.w xr3, xr5, xr7             // sum0_l

    // HEVC_EVEN16_CALC
    xvsub.w xr24, xr1, xr14           // 7
    xvsub.w xr25, xr3, xr15
    xvadd.w xr14, xr1, xr14           // 0
    xvadd.w xr15, xr3, xr15
    xvst xr24, t2, 7*16*4             // 448=16*28=7*16*4
    xvst xr25, t2, 7*16*4+32          // 480
    xvst xr14, t2, 0
    xvst xr15, t2, 32

    xvsub.w xr26, xr0, xr22           // 4
    xvsub.w xr27, xr2, xr23
    xvadd.w xr22, xr0, xr22           // 3
    xvadd.w xr23, xr2, xr23
    xvst xr26, t2, 4*16*4             // 256=4*16*4
    xvst xr27, t2, 4*16*4+32          // 288
    xvst xr22, t2, 3*16*4             // 192=3*16*4
    xvst xr23, t2, 3*16*4+32          // 224

    xvldrepl.w xr20, t1, 16
    xvldrepl.w xr21, t1, 20

    xvmulwev.w.h xr4, xr10, xr20
    xvmaddwod.w.h xr4, xr10, xr20
    xvmulwev.w.h xr5, xr11, xr20
    xvmaddwod.w.h xr5, xr11, xr20
    xvmulwev.w.h xr6, xr12, xr21
    xvmaddwod.w.h xr6, xr12, xr21
    xvmulwev.w.h xr7, xr13, xr21
    xvmaddwod.w.h xr7, xr13, xr21

    xvsub.w xr0, xr4, xr6             // sum1_r
    xvadd.w xr1, xr4, xr6             // sum0_r
    xvsub.w xr2, xr5, xr7             // sum1_l
    xvadd.w xr3, xr5, xr7             // sum0_l

    // HEVC_EVEN16_CALC
    xvsub.w xr24, xr1, xr16           // 6
    xvsub.w xr25, xr3, xr17
    xvadd.w xr16, xr1, xr16           // 1
    xvadd.w xr17, xr3, xr17
    xvst xr24, t2, 6*16*4             // 384=6*16*4
    xvst xr25, t2, 6*16*4+32          // 416
    xvst xr16, t2, 1*16*4             // 64=1*16*4
    xvst xr17, t2, 1*16*4+32          // 96

    xvsub.w xr26, xr0, xr18           // 5
    xvsub.w xr27, xr2, xr19
    xvadd.w xr18, xr0, xr18           // 2
    xvadd.w xr19, xr2, xr19
    xvst xr26, t2, 5*16*4             // 320=5*16*4
    xvst xr27, t2, 5*16*4+32          // 352
    xvst xr18, t2, 2*16*4             // 128=2*16*4
    xvst xr19, t2, 2*16*4+32          // 160

    /* process coeff 2 6 10 14 18 22 26 30 */
    addi.d t0, a0, \buf_pitch*2*2

    xvld xr0, t0, 0
    xvld xr1, t0, \buf_pitch*4*2
    xvld xr2, t0, \buf_pitch*8*2
    xvld xr3, t0, \buf_pitch*12*2

    xvld xr4, t0, \buf_pitch*16*2
    xvld xr5, t0, \buf_pitch*20*2
    xvld xr6, t0, \buf_pitch*24*2
    xvld xr7, t0, \buf_pitch*28*2

    xvilvl.h xr8, xr1, xr0
    xvilvh.h xr9, xr1, xr0
    xvilvl.h xr10, xr3, xr2
    xvilvh.h xr11, xr3, xr2
    xvilvl.h xr12, xr5, xr4
    xvilvh.h xr13, xr5, xr4
    xvilvl.h xr14, xr7, xr6
    xvilvh.h xr15, xr7, xr6

    la.local t1, gt32x32_cnst1

    addi.d t2, sp, 64
    addi.d t3, sp, 64+960             // 30*32

    idct_16x32_step1_lasx

    .rept 7
        addi.d t1, t1, 16
        addi.d t2, t2, 64
        addi.d t3, t3, -64
        idct_16x32_step1_lasx
    .endr

    addi.d t0, a0, \buf_pitch*2

    xvld xr0, t0, 0
    xvld xr1, t0, \buf_pitch*2*2
    xvld xr2, t0, \buf_pitch*4*2
    xvld xr3, t0, \buf_pitch*6*2
    xvld xr4, t0, \buf_pitch*8*2
    xvld xr5, t0, \buf_pitch*10*2
    xvld xr6, t0, \buf_pitch*12*2
    xvld xr7, t0, \buf_pitch*14*2

    xvilvl.h xr8, xr1, xr0
    xvilvh.h xr9, xr1, xr0
    xvilvl.h xr10, xr3, xr2
    xvilvh.h xr11, xr3, xr2
    xvilvl.h xr12, xr5, xr4
    xvilvh.h xr13, xr5, xr4
    xvilvl.h xr14, xr7, xr6
    xvilvh.h xr15, xr7, xr6

    la.local t1, gt32x32_cnst0

    idct_16x32_step2_lasx xr8, xr9, xr10, xr11, xr12, xr13, \
                          xr14, xr15, xr16, xr17

    addi.d t0, a0, \buf_pitch*16*2+\buf_pitch*2

    xvld xr0, t0, 0
    xvld xr1, t0, \buf_pitch*2*2
    xvld xr2, t0, \buf_pitch*4*2
    xvld xr3, t0, \buf_pitch*6*2
    xvld xr4, t0, \buf_pitch*8*2
    xvld xr5, t0, \buf_pitch*10*2
    xvld xr6, t0, \buf_pitch*12*2
    xvld xr7, t0, \buf_pitch*14*2

    xvilvl.h xr18, xr1, xr0
    xvilvh.h xr19, xr1, xr0
    xvilvl.h xr24, xr3, xr2
    xvilvh.h xr25, xr3, xr2
    xvilvl.h xr26, xr5, xr4
    xvilvh.h xr27, xr5, xr4
    xvilvl.h xr28, xr7, xr6
    xvilvh.h xr29, xr7, xr6

    addi.d t1, t1, 16
    idct_16x32_step2_lasx xr18, xr19, xr24, xr25, xr26, xr27, \
                          xr28, xr29, xr30, xr31

    addi.d t4, a0, 0
    addi.d t5, a0, \buf_pitch*31*2
    addi.d t2, sp, 64

    idct_16x32_step3_lasx \round

    .rept 15

        addi.d t1, t1, 16
        idct_16x32_step2_lasx xr8, xr9, xr10, xr11, xr12, xr13, \
                              xr14, xr15, xr16, xr17

        addi.d t1, t1, 16
        idct_16x32_step2_lasx xr18, xr19, xr24, xr25, xr26, xr27, \
                              xr28, xr29, xr30, xr31

        addi.d t2, t2, 64
        addi.d t4, t4, \buf_pitch*2
        addi.d t5, t5, -\buf_pitch*2

        idct_16x32_step3_lasx \round
    .endr

.endm
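
/*
 * Column passes over one 16-column half of the block:
 * hevc_idct_16x32_column_step1_lasx runs the first pass on the 32x32
 * coefficient buffer (pitch 32, shift 7);
 * hevc_idct_16x32_column_step2_lasx runs the second pass on the transposed
 * 16x32 temporary buffer (pitch 16, shift 12). The stack frame provides the
 * scratch space for the even-part partial sums above the FPR spill area.
 */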

function hevc_idct_16x32_column_step1_lasx
    addi.d sp, sp, -1600              // 64+512*3
    fr_store

    idct_16x32_lasx 32, 7

    fr_recover
    addi.d sp, sp, 1600
endfunc

function hevc_idct_16x32_column_step2_lasx
    addi.d sp, sp, -1600              // 64+512*3
    fr_store

    idct_16x32_lasx 16, 12

    fr_recover
    addi.d sp, sp, 1600
endfunc
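
/*
 * Transpose a 32x16 block of int16 coefficients at a0 into a 16x32 block at
 * a1, built from LSX_TRANSPOSE8x8_H 8x8 transposes of the 128-bit halves and
 * stitched back into 256-bit rows with xvpermi.q.
 */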

function hevc_idct_transpose_32x16_to_16x32_lasx
    fr_store

    xvld xr0, a0, 0
    xvld xr1, a0, 64
    xvld xr2, a0, 128
    xvld xr3, a0, 192
    xvld xr4, a0, 256
    xvld xr5, a0, 320
    xvld xr6, a0, 384
    xvld xr7, a0, 448

    xvpermi.q xr8, xr0, 0x01
    xvpermi.q xr9, xr1, 0x01
    xvpermi.q xr10, xr2, 0x01
    xvpermi.q xr11, xr3, 0x01
    xvpermi.q xr12, xr4, 0x01
    xvpermi.q xr13, xr5, 0x01
    xvpermi.q xr14, xr6, 0x01
    xvpermi.q xr15, xr7, 0x01

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    addi.d a0, a0, 512

    vld vr24, a0, 0
    vld vr25, a0, 64
    vld vr26, a0, 128
    vld vr27, a0, 192
    vld vr28, a0, 256
    vld vr29, a0, 320
    vld vr30, a0, 384
    vld vr31, a0, 448

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr0, xr24, 0x02
    xvpermi.q xr1, xr25, 0x02
    xvpermi.q xr2, xr26, 0x02
    xvpermi.q xr3, xr27, 0x02
    xvpermi.q xr4, xr28, 0x02
    xvpermi.q xr5, xr29, 0x02
    xvpermi.q xr6, xr30, 0x02
    xvpermi.q xr7, xr31, 0x02

    xvst xr0, a1, 0
    xvst xr1, a1, 32
    xvst xr2, a1, 64
    xvst xr3, a1, 96
    xvst xr4, a1, 128
    xvst xr5, a1, 160
    xvst xr6, a1, 192
    xvst xr7, a1, 224

    addi.d a1, a1, 256
    addi.d a0, a0, 16

    vld vr24, a0, 0
    vld vr25, a0, 64
    vld vr26, a0, 128
    vld vr27, a0, 192
    vld vr28, a0, 256
    vld vr29, a0, 320
    vld vr30, a0, 384
    vld vr31, a0, 448

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr8, xr24, 0x02
    xvpermi.q xr9, xr25, 0x02
    xvpermi.q xr10, xr26, 0x02
    xvpermi.q xr11, xr27, 0x02
    xvpermi.q xr12, xr28, 0x02
    xvpermi.q xr13, xr29, 0x02
    xvpermi.q xr14, xr30, 0x02
    xvpermi.q xr15, xr31, 0x02

    xvst xr8, a1, 0
    xvst xr9, a1, 32
    xvst xr10, a1, 64
    xvst xr11, a1, 96
    xvst xr12, a1, 128
    xvst xr13, a1, 160
    xvst xr14, a1, 192
    xvst xr15, a1, 224

    // second
    addi.d a0, a0, 32-512-16

    xvld xr0, a0, 0
    xvld xr1, a0, 64
    xvld xr2, a0, 128
    xvld xr3, a0, 192
    xvld xr4, a0, 256
    xvld xr5, a0, 320
    xvld xr6, a0, 384
    xvld xr7, a0, 448

    xvpermi.q xr8, xr0, 0x01
    xvpermi.q xr9, xr1, 0x01
    xvpermi.q xr10, xr2, 0x01
    xvpermi.q xr11, xr3, 0x01
    xvpermi.q xr12, xr4, 0x01
    xvpermi.q xr13, xr5, 0x01
    xvpermi.q xr14, xr6, 0x01
    xvpermi.q xr15, xr7, 0x01

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    addi.d a0, a0, 512

    vld vr24, a0, 0
    vld vr25, a0, 64
    vld vr26, a0, 128
    vld vr27, a0, 192
    vld vr28, a0, 256
    vld vr29, a0, 320
    vld vr30, a0, 384
    vld vr31, a0, 448

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr0, xr24, 0x02
    xvpermi.q xr1, xr25, 0x02
    xvpermi.q xr2, xr26, 0x02
    xvpermi.q xr3, xr27, 0x02
    xvpermi.q xr4, xr28, 0x02
    xvpermi.q xr5, xr29, 0x02
    xvpermi.q xr6, xr30, 0x02
    xvpermi.q xr7, xr31, 0x02

    addi.d a1, a1, 256
    xvst xr0, a1, 0
    xvst xr1, a1, 32
    xvst xr2, a1, 64
    xvst xr3, a1, 96
    xvst xr4, a1, 128
    xvst xr5, a1, 160
    xvst xr6, a1, 192
    xvst xr7, a1, 224

    addi.d a1, a1, 256
    addi.d a0, a0, 16

    vld vr24, a0, 0
    vld vr25, a0, 64
    vld vr26, a0, 128
    vld vr27, a0, 192
    vld vr28, a0, 256
    vld vr29, a0, 320
    vld vr30, a0, 384
    vld vr31, a0, 448

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr8, xr24, 0x02
    xvpermi.q xr9, xr25, 0x02
    xvpermi.q xr10, xr26, 0x02
    xvpermi.q xr11, xr27, 0x02
    xvpermi.q xr12, xr28, 0x02
    xvpermi.q xr13, xr29, 0x02
    xvpermi.q xr14, xr30, 0x02
    xvpermi.q xr15, xr31, 0x02

    xvst xr8, a1, 0
    xvst xr9, a1, 32
    xvst xr10, a1, 64
    xvst xr11, a1, 96
    xvst xr12, a1, 128
    xvst xr13, a1, 160
    xvst xr14, a1, 192
    xvst xr15, a1, 224

    fr_recover
endfunc
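
/*
 * Inverse rearrangement: transpose the 16x32 temporary block at a0 back into
 * the 32x16 layout at a1, using the same 8x8 transpose plus xvpermi.q
 * stitching as above.
 */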

function hevc_idct_transpose_16x32_to_32x16_lasx
    fr_store

    xvld xr0, a0, 0
    xvld xr1, a0, 32
    xvld xr2, a0, 64
    xvld xr3, a0, 96
    xvld xr4, a0, 128
    xvld xr5, a0, 160
    xvld xr6, a0, 192
    xvld xr7, a0, 224

    xvpermi.q xr8, xr0, 0x01
    xvpermi.q xr9, xr1, 0x01
    xvpermi.q xr10, xr2, 0x01
    xvpermi.q xr11, xr3, 0x01
    xvpermi.q xr12, xr4, 0x01
    xvpermi.q xr13, xr5, 0x01
    xvpermi.q xr14, xr6, 0x01
    xvpermi.q xr15, xr7, 0x01

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    addi.d a0, a0, 256

    vld vr24, a0, 0
    vld vr25, a0, 32
    vld vr26, a0, 64
    vld vr27, a0, 96
    vld vr28, a0, 128
    vld vr29, a0, 160
    vld vr30, a0, 192
    vld vr31, a0, 224

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr0, xr24, 0x02
    xvpermi.q xr1, xr25, 0x02
    xvpermi.q xr2, xr26, 0x02
    xvpermi.q xr3, xr27, 0x02
    xvpermi.q xr4, xr28, 0x02
    xvpermi.q xr5, xr29, 0x02
    xvpermi.q xr6, xr30, 0x02
    xvpermi.q xr7, xr31, 0x02

    xvst xr0, a1, 0
    xvst xr1, a1, 64
    xvst xr2, a1, 128
    xvst xr3, a1, 192
    xvst xr4, a1, 256
    xvst xr5, a1, 320
    xvst xr6, a1, 384
    xvst xr7, a1, 448

    addi.d a1, a1, 512
    addi.d a0, a0, 16

    vld vr24, a0, 0
    vld vr25, a0, 32
    vld vr26, a0, 64
    vld vr27, a0, 96
    vld vr28, a0, 128
    vld vr29, a0, 160
    vld vr30, a0, 192
    vld vr31, a0, 224

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr8, xr24, 0x02
    xvpermi.q xr9, xr25, 0x02
    xvpermi.q xr10, xr26, 0x02
    xvpermi.q xr11, xr27, 0x02
    xvpermi.q xr12, xr28, 0x02
    xvpermi.q xr13, xr29, 0x02
    xvpermi.q xr14, xr30, 0x02
    xvpermi.q xr15, xr31, 0x02

    xvst xr8, a1, 0
    xvst xr9, a1, 64
    xvst xr10, a1, 128
    xvst xr11, a1, 192
    xvst xr12, a1, 256
    xvst xr13, a1, 320
    xvst xr14, a1, 384
    xvst xr15, a1, 448

    // second
    addi.d a0, a0, 256-16

    xvld xr0, a0, 0
    xvld xr1, a0, 32
    xvld xr2, a0, 64
    xvld xr3, a0, 96
    xvld xr4, a0, 128
    xvld xr5, a0, 160
    xvld xr6, a0, 192
    xvld xr7, a0, 224

    xvpermi.q xr8, xr0, 0x01
    xvpermi.q xr9, xr1, 0x01
    xvpermi.q xr10, xr2, 0x01
    xvpermi.q xr11, xr3, 0x01
    xvpermi.q xr12, xr4, 0x01
    xvpermi.q xr13, xr5, 0x01
    xvpermi.q xr14, xr6, 0x01
    xvpermi.q xr15, xr7, 0x01

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    addi.d a0, a0, 256

    vld vr24, a0, 0
    vld vr25, a0, 32
    vld vr26, a0, 64
    vld vr27, a0, 96
    vld vr28, a0, 128
    vld vr29, a0, 160
    vld vr30, a0, 192
    vld vr31, a0, 224

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr0, xr24, 0x02
    xvpermi.q xr1, xr25, 0x02
    xvpermi.q xr2, xr26, 0x02
    xvpermi.q xr3, xr27, 0x02
    xvpermi.q xr4, xr28, 0x02
    xvpermi.q xr5, xr29, 0x02
    xvpermi.q xr6, xr30, 0x02
    xvpermi.q xr7, xr31, 0x02

    addi.d a1, a1, -512+32

    xvst xr0, a1, 0
    xvst xr1, a1, 64
    xvst xr2, a1, 128
    xvst xr3, a1, 192
    xvst xr4, a1, 256
    xvst xr5, a1, 320
    xvst xr6, a1, 384
    xvst xr7, a1, 448

    addi.d a1, a1, 512
    addi.d a0, a0, 16

    vld vr24, a0, 0
    vld vr25, a0, 32
    vld vr26, a0, 64
    vld vr27, a0, 96
    vld vr28, a0, 128
    vld vr29, a0, 160
    vld vr30, a0, 192
    vld vr31, a0, 224

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    xvpermi.q xr8, xr24, 0x02
    xvpermi.q xr9, xr25, 0x02
    xvpermi.q xr10, xr26, 0x02
    xvpermi.q xr11, xr27, 0x02
    xvpermi.q xr12, xr28, 0x02
    xvpermi.q xr13, xr29, 0x02
    xvpermi.q xr14, xr30, 0x02
    xvpermi.q xr15, xr31, 0x02

    xvst xr8, a1, 0
    xvst xr9, a1, 64
    xvst xr10, a1, 128
    xvst xr11, a1, 192
    xvst xr12, a1, 256
    xvst xr13, a1, 320
    xvst xr14, a1, 384
    xvst xr15, a1, 448

    fr_recover
endfunc
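
/*
 * Top-level 32x32 inverse transform on the coefficient block at a0 (the
 * second argument is saved to t6 but not otherwise used here). The first
 * pass runs on the left and right 16-column halves in place; each 16-row
 * half is then transposed into a stack temporary at t8, run through the
 * second pass, and transposed back.
 */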

function ff_hevc_idct_32x32_lasx

    addi.d t7, a0, 0
    addi.d t6, a1, 0

    addi.d sp, sp, -8
    st.d ra, sp, 0

    bl hevc_idct_16x32_column_step1_lasx

    addi.d a0, a0, 32

    bl hevc_idct_16x32_column_step1_lasx

    addi.d sp, sp, -1086              // (16*32+31)*2
    fr_store

    addi.d t8, sp, 64+31*2            // tmp_buf_ptr

    addi.d a0, t7, 0
    addi.d a1, t8, 0
    bl hevc_idct_transpose_32x16_to_16x32_lasx

    addi.d a0, t8, 0
    bl hevc_idct_16x32_column_step2_lasx

    addi.d a0, t8, 0
    addi.d a1, t7, 0
    bl hevc_idct_transpose_16x32_to_32x16_lasx

    // second
    addi.d a0, t7, 32*8*2*2
    addi.d a1, t8, 0
    bl hevc_idct_transpose_32x16_to_16x32_lasx

    addi.d a0, t8, 0
    bl hevc_idct_16x32_column_step2_lasx

    addi.d a0, t8, 0
    addi.d a1, t7, 32*8*2*2
    bl hevc_idct_transpose_16x32_to_32x16_lasx

    fr_recover
    addi.d sp, sp, 1086               // (16*32+31)*2

    ld.d ra, sp, 0
    addi.d sp, sp, 8

endfunc