/*
 * Loongson LASX optimized h264idct
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
#include "loongson_asm.S"
|
|
|
/* |
|
* #define FUNC2(a, b, c) FUNC3(a, b, c) |
|
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
|
* void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride) |
|
* LSX optimization is enough for this function. |
|
*/ |
|
function ff_h264_idct_add_8_lsx |
|
fld.d f0, a1, 0 |
|
fld.d f1, a1, 8 |
|
fld.d f2, a1, 16 |
|
fld.d f3, a1, 24 |
|
vxor.v vr7, vr7, vr7 |
|
add.d t2, a2, a2 |
|
add.d t3, t2, a2 |
|
vst vr7, a1, 0 |
|
vst vr7, a1, 16 |
|
|
|
vadd.h vr4, vr0, vr2 |
|
vsub.h vr5, vr0, vr2 |
|
vsrai.h vr6, vr1, 1 |
|
vsrai.h vr7, vr3, 1 |
|
vsub.h vr6, vr6, vr3 |
|
vadd.h vr7, vr1, vr7 |
|
LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 |
|
LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5 |
|
vadd.h vr4, vr0, vr2 |
|
vsub.h vr5, vr0, vr2 |
|
vsrai.h vr6, vr1, 1 |
|
vsrai.h vr7, vr3, 1 |
|
vsub.h vr6, vr6, vr3 |
|
vadd.h vr7, vr1, vr7 |
|
LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 |
|
|
|
fld.s f4, a0, 0 |
|
fldx.s f5, a0, a2 |
|
fldx.s f6, a0, t2 |
|
fldx.s f7, a0, t3 |
|
|
|
vsrari.h vr0, vr0, 6 |
|
vsrari.h vr1, vr1, 6 |
|
vsrari.h vr2, vr2, 6 |
|
vsrari.h vr3, vr3, 6 |
|
|
|
vsllwil.hu.bu vr4, vr4, 0 |
|
vsllwil.hu.bu vr5, vr5, 0 |
|
vsllwil.hu.bu vr6, vr6, 0 |
|
vsllwil.hu.bu vr7, vr7, 0 |
|
vadd.h vr0, vr0, vr4 |
|
vadd.h vr1, vr1, vr5 |
|
vadd.h vr2, vr2, vr6 |
|
vadd.h vr3, vr3, vr7 |
|
vssrarni.bu.h vr1, vr0, 0 |
|
vssrarni.bu.h vr3, vr2, 0 |
|
|
|
vbsrl.v vr0, vr1, 8 |
|
vbsrl.v vr2, vr3, 8 |
|
fst.s f1, a0, 0 |
|
fstx.s f0, a0, a2 |
|
fstx.s f3, a0, t2 |
|
fstx.s f2, a0, t3 |
|
endfunc |
|
|
|
/* |
|
* #define FUNC2(a, b, c) FUNC3(a, b, c) |
|
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
|
* void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride) |
|
*/ |
|
function ff_h264_idct8_add_8_lsx |
|
ld.h t0, a1, 0 |
|
add.d t2, a2, a2 |
|
add.d t3, t2, a2 |
|
add.d t4, t3, a2 |
|
add.d t5, t4, a2 |
|
add.d t6, t5, a2 |
|
add.d t7, t6, a2 |
|
addi.w t0, t0, 32 |
|
st.h t0, a1, 0 |
|
|
|
vld vr0, a1, 0 |
|
vld vr1, a1, 16 |
|
vld vr2, a1, 32 |
|
vld vr3, a1, 48 |
|
vld vr4, a1, 64 |
|
vld vr5, a1, 80 |
|
vld vr6, a1, 96 |
|
vld vr7, a1, 112 |
|
vxor.v vr8, vr8, vr8 |
|
vst vr8, a1, 0 |
|
vst vr8, a1, 16 |
|
vst vr8, a1, 32 |
|
vst vr8, a1, 48 |
|
vst vr8, a1, 64 |
|
vst vr8, a1, 80 |
|
vst vr8, a1, 96 |
|
vst vr8, a1, 112 |
|
|
|
vadd.h vr18, vr0, vr4 |
|
vsub.h vr19, vr0, vr4 |
|
vsrai.h vr20, vr2, 1 |
|
vsrai.h vr21, vr6, 1 |
|
vsub.h vr20, vr20, vr6 |
|
vadd.h vr21, vr21, vr2 |
|
LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16 |
|
vsrai.h vr11, vr7, 1 |
|
vsrai.h vr13, vr3, 1 |
|
vsrai.h vr15, vr5, 1 |
|
vsrai.h vr17, vr1, 1 |
|
vsub.h vr11, vr5, vr11 |
|
vsub.h vr13, vr7, vr13 |
|
vadd.h vr15, vr7, vr15 |
|
vadd.h vr17, vr5, vr17 |
|
vsub.h vr11, vr11, vr7 |
|
vsub.h vr13, vr13, vr3 |
|
vadd.h vr15, vr15, vr5 |
|
vadd.h vr17, vr17, vr1 |
|
vsub.h vr11, vr11, vr3 |
|
vadd.h vr13, vr13, vr1 |
|
vsub.h vr15, vr15, vr1 |
|
vadd.h vr17, vr17, vr3 |
|
vsrai.h vr18, vr11, 2 |
|
vsrai.h vr19, vr13, 2 |
|
vsrai.h vr20, vr15, 2 |
|
vsrai.h vr21, vr17, 2 |
|
vadd.h vr11, vr11, vr21 |
|
vadd.h vr13, vr13, vr20 |
|
vsub.h vr15, vr19, vr15 |
|
vsub.h vr17, vr17, vr18 |
|
LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \ |
|
vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7 |
|
|
|
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
|
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
|
vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17 |
|
vexth.w.h vr20, vr0 |
|
vexth.w.h vr21, vr1 |
|
vexth.w.h vr22, vr2 |
|
vexth.w.h vr23, vr3 |
|
vexth.w.h vr8, vr4 |
|
vexth.w.h vr9, vr5 |
|
vexth.w.h vr18, vr6 |
|
vexth.w.h vr19, vr7 |
|
vsllwil.w.h vr0, vr0, 0 |
|
vsllwil.w.h vr1, vr1, 0 |
|
vsllwil.w.h vr2, vr2, 0 |
|
vsllwil.w.h vr3, vr3, 0 |
|
vsllwil.w.h vr4, vr4, 0 |
|
vsllwil.w.h vr5, vr5, 0 |
|
vsllwil.w.h vr6, vr6, 0 |
|
vsllwil.w.h vr7, vr7, 0 |
|
|
|
vadd.w vr11, vr0, vr4 |
|
vsub.w vr13, vr0, vr4 |
|
vsrai.w vr15, vr2, 1 |
|
vsrai.w vr17, vr6, 1 |
|
vsub.w vr15, vr15, vr6 |
|
vadd.w vr17, vr17, vr2 |
|
LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16 |
|
vsrai.w vr11, vr7, 1 |
|
vsrai.w vr13, vr3, 1 |
|
vsrai.w vr15, vr5, 1 |
|
vsrai.w vr17, vr1, 1 |
|
vsub.w vr11, vr5, vr11 |
|
vsub.w vr13, vr7, vr13 |
|
vadd.w vr15, vr7, vr15 |
|
vadd.w vr17, vr5, vr17 |
|
vsub.w vr11, vr11, vr7 |
|
vsub.w vr13, vr13, vr3 |
|
vadd.w vr15, vr15, vr5 |
|
vadd.w vr17, vr17, vr1 |
|
vsub.w vr11, vr11, vr3 |
|
vadd.w vr13, vr13, vr1 |
|
vsub.w vr15, vr15, vr1 |
|
vadd.w vr17, vr17, vr3 |
|
vsrai.w vr0, vr11, 2 |
|
vsrai.w vr1, vr13, 2 |
|
vsrai.w vr2, vr15, 2 |
|
vsrai.w vr3, vr17, 2 |
|
vadd.w vr11, vr11, vr3 |
|
vadd.w vr13, vr13, vr2 |
|
vsub.w vr15, vr1, vr15 |
|
vsub.w vr17, vr17, vr0 |
|
LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \ |
|
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 |
|
|
|
vadd.w vr11, vr20, vr8 |
|
vsub.w vr13, vr20, vr8 |
|
vsrai.w vr15, vr22, 1 |
|
vsrai.w vr17, vr18, 1 |
|
vsub.w vr15, vr15, vr18 |
|
vadd.w vr17, vr17, vr22 |
|
LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16 |
|
vsrai.w vr11, vr19, 1 |
|
vsrai.w vr13, vr23, 1 |
|
vsrai.w vr15, vr9, 1 |
|
vsrai.w vr17, vr21, 1 |
|
vsub.w vr11, vr9, vr11 |
|
vsub.w vr13, vr19, vr13 |
|
vadd.w vr15, vr19, vr15 |
|
vadd.w vr17, vr9, vr17 |
|
vsub.w vr11, vr11, vr19 |
|
vsub.w vr13, vr13, vr23 |
|
vadd.w vr15, vr15, vr9 |
|
vadd.w vr17, vr17, vr21 |
|
vsub.w vr11, vr11, vr23 |
|
vadd.w vr13, vr13, vr21 |
|
vsub.w vr15, vr15, vr21 |
|
vadd.w vr17, vr17, vr23 |
|
vsrai.w vr20, vr11, 2 |
|
vsrai.w vr21, vr13, 2 |
|
vsrai.w vr22, vr15, 2 |
|
vsrai.w vr23, vr17, 2 |
|
vadd.w vr11, vr11, vr23 |
|
vadd.w vr13, vr13, vr22 |
|
vsub.w vr15, vr21, vr15 |
|
vsub.w vr17, vr17, vr20 |
|
LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \ |
|
vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19 |
|
|
|
vld vr10, a0, 0 |
|
vldx vr11, a0, a2 |
|
vldx vr12, a0, t2 |
|
vldx vr13, a0, t3 |
|
vldx vr14, a0, t4 |
|
vldx vr15, a0, t5 |
|
vldx vr16, a0, t6 |
|
vldx vr17, a0, t7 |
|
vsrani.h.w vr20, vr0, 6 |
|
vsrani.h.w vr21, vr1, 6 |
|
vsrani.h.w vr22, vr2, 6 |
|
vsrani.h.w vr23, vr3, 6 |
|
vsrani.h.w vr8, vr4, 6 |
|
vsrani.h.w vr9, vr5, 6 |
|
vsrani.h.w vr18, vr6, 6 |
|
vsrani.h.w vr19, vr7, 6 |
|
vsllwil.hu.bu vr10, vr10, 0 |
|
vsllwil.hu.bu vr11, vr11, 0 |
|
vsllwil.hu.bu vr12, vr12, 0 |
|
vsllwil.hu.bu vr13, vr13, 0 |
|
vsllwil.hu.bu vr14, vr14, 0 |
|
vsllwil.hu.bu vr15, vr15, 0 |
|
vsllwil.hu.bu vr16, vr16, 0 |
|
vsllwil.hu.bu vr17, vr17, 0 |
|
|
|
vadd.h vr0, vr20, vr10 |
|
vadd.h vr1, vr21, vr11 |
|
vadd.h vr2, vr22, vr12 |
|
vadd.h vr3, vr23, vr13 |
|
vadd.h vr4, vr8, vr14 |
|
vadd.h vr5, vr9, vr15 |
|
vadd.h vr6, vr18, vr16 |
|
vadd.h vr7, vr19, vr17 |
|
vssrarni.bu.h vr1, vr0, 0 |
|
vssrarni.bu.h vr3, vr2, 0 |
|
vssrarni.bu.h vr5, vr4, 0 |
|
vssrarni.bu.h vr7, vr6, 0 |
|
vbsrl.v vr0, vr1, 8 |
|
vbsrl.v vr2, vr3, 8 |
|
vbsrl.v vr4, vr5, 8 |
|
vbsrl.v vr6, vr7, 8 |
|
fst.d f1, a0, 0 |
|
fstx.d f0, a0, a2 |
|
fstx.d f3, a0, t2 |
|
fstx.d f2, a0, t3 |
|
fstx.d f5, a0, t4 |
|
fstx.d f4, a0, t5 |
|
fstx.d f7, a0, t6 |
|
fstx.d f6, a0, t7 |
|
endfunc |
|
|
|
/* |
|
* #define FUNC2(a, b, c) FUNC3(a, b, c) |
|
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
|
* void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride) |
|
*/ |
|
function ff_h264_idct8_add_8_lasx |
|
ld.h t0, a1, 0 |
|
add.d t2, a2, a2 |
|
add.d t3, t2, a2 |
|
add.d t4, t3, a2 |
|
add.d t5, t4, a2 |
|
add.d t6, t5, a2 |
|
add.d t7, t6, a2 |
|
addi.w t0, t0, 32 |
|
st.h t0, a1, 0 |
|
|
|
vld vr0, a1, 0 |
|
vld vr1, a1, 16 |
|
vld vr2, a1, 32 |
|
vld vr3, a1, 48 |
|
vld vr4, a1, 64 |
|
vld vr5, a1, 80 |
|
vld vr6, a1, 96 |
|
vld vr7, a1, 112 |
|
xvxor.v xr8, xr8, xr8 |
|
xvst xr8, a1, 0 |
|
xvst xr8, a1, 32 |
|
xvst xr8, a1, 64 |
|
xvst xr8, a1, 96 |
|
|
|
vadd.h vr18, vr0, vr4 |
|
vsub.h vr19, vr0, vr4 |
|
vsrai.h vr20, vr2, 1 |
|
vsrai.h vr21, vr6, 1 |
|
vsub.h vr20, vr20, vr6 |
|
vadd.h vr21, vr21, vr2 |
|
LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16 |
|
vsrai.h vr11, vr7, 1 |
|
vsrai.h vr13, vr3, 1 |
|
vsrai.h vr15, vr5, 1 |
|
vsrai.h vr17, vr1, 1 |
|
vsub.h vr11, vr5, vr11 |
|
vsub.h vr13, vr7, vr13 |
|
vadd.h vr15, vr7, vr15 |
|
vadd.h vr17, vr5, vr17 |
|
vsub.h vr11, vr11, vr7 |
|
vsub.h vr13, vr13, vr3 |
|
vadd.h vr15, vr15, vr5 |
|
vadd.h vr17, vr17, vr1 |
|
vsub.h vr11, vr11, vr3 |
|
vadd.h vr13, vr13, vr1 |
|
vsub.h vr15, vr15, vr1 |
|
vadd.h vr17, vr17, vr3 |
|
vsrai.h vr18, vr11, 2 |
|
vsrai.h vr19, vr13, 2 |
|
vsrai.h vr20, vr15, 2 |
|
vsrai.h vr21, vr17, 2 |
|
vadd.h vr11, vr11, vr21 |
|
vadd.h vr13, vr13, vr20 |
|
vsub.h vr15, vr19, vr15 |
|
vsub.h vr17, vr17, vr18 |
|
LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \ |
|
vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7 |
|
|
|
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
|
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ |
|
vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17 |
|
vext2xv.w.h xr0, xr0 |
|
vext2xv.w.h xr1, xr1 |
|
vext2xv.w.h xr2, xr2 |
|
vext2xv.w.h xr3, xr3 |
|
vext2xv.w.h xr4, xr4 |
|
vext2xv.w.h xr5, xr5 |
|
vext2xv.w.h xr6, xr6 |
|
vext2xv.w.h xr7, xr7 |
|
|
|
xvadd.w xr11, xr0, xr4 |
|
xvsub.w xr13, xr0, xr4 |
|
xvsrai.w xr15, xr2, 1 |
|
xvsrai.w xr17, xr6, 1 |
|
xvsub.w xr15, xr15, xr6 |
|
xvadd.w xr17, xr17, xr2 |
|
LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17, xr10, xr12, xr14, xr16 |
|
xvsrai.w xr11, xr7, 1 |
|
xvsrai.w xr13, xr3, 1 |
|
xvsrai.w xr15, xr5, 1 |
|
xvsrai.w xr17, xr1, 1 |
|
xvsub.w xr11, xr5, xr11 |
|
xvsub.w xr13, xr7, xr13 |
|
xvadd.w xr15, xr7, xr15 |
|
xvadd.w xr17, xr5, xr17 |
|
xvsub.w xr11, xr11, xr7 |
|
xvsub.w xr13, xr13, xr3 |
|
xvadd.w xr15, xr15, xr5 |
|
xvadd.w xr17, xr17, xr1 |
|
xvsub.w xr11, xr11, xr3 |
|
xvadd.w xr13, xr13, xr1 |
|
xvsub.w xr15, xr15, xr1 |
|
xvadd.w xr17, xr17, xr3 |
|
xvsrai.w xr0, xr11, 2 |
|
xvsrai.w xr1, xr13, 2 |
|
xvsrai.w xr2, xr15, 2 |
|
xvsrai.w xr3, xr17, 2 |
|
xvadd.w xr11, xr11, xr3 |
|
xvadd.w xr13, xr13, xr2 |
|
xvsub.w xr15, xr1, xr15 |
|
xvsub.w xr17, xr17, xr0 |
|
LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \ |
|
xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7 |
|
|
|
vld vr10, a0, 0 |
|
vldx vr11, a0, a2 |
|
vldx vr12, a0, t2 |
|
vldx vr13, a0, t3 |
|
vldx vr14, a0, t4 |
|
vldx vr15, a0, t5 |
|
vldx vr16, a0, t6 |
|
vldx vr17, a0, t7 |
|
xvldi xr8, 0x806 //"xvldi.w xr8 6" |
|
xvsran.h.w xr0, xr0, xr8 |
|
xvsran.h.w xr1, xr1, xr8 |
|
xvsran.h.w xr2, xr2, xr8 |
|
xvsran.h.w xr3, xr3, xr8 |
|
xvsran.h.w xr4, xr4, xr8 |
|
xvsran.h.w xr5, xr5, xr8 |
|
xvsran.h.w xr6, xr6, xr8 |
|
xvsran.h.w xr7, xr7, xr8 |
|
xvpermi.d xr0, xr0, 0x08 |
|
xvpermi.d xr1, xr1, 0x08 |
|
xvpermi.d xr2, xr2, 0x08 |
|
xvpermi.d xr3, xr3, 0x08 |
|
xvpermi.d xr4, xr4, 0x08 |
|
xvpermi.d xr5, xr5, 0x08 |
|
xvpermi.d xr6, xr6, 0x08 |
|
xvpermi.d xr7, xr7, 0x08 |
|
|
|
vsllwil.hu.bu vr10, vr10, 0 |
|
vsllwil.hu.bu vr11, vr11, 0 |
|
vsllwil.hu.bu vr12, vr12, 0 |
|
vsllwil.hu.bu vr13, vr13, 0 |
|
vsllwil.hu.bu vr14, vr14, 0 |
|
vsllwil.hu.bu vr15, vr15, 0 |
|
vsllwil.hu.bu vr16, vr16, 0 |
|
vsllwil.hu.bu vr17, vr17, 0 |
|
|
|
vadd.h vr0, vr0, vr10 |
|
vadd.h vr1, vr1, vr11 |
|
vadd.h vr2, vr2, vr12 |
|
vadd.h vr3, vr3, vr13 |
|
vadd.h vr4, vr4, vr14 |
|
vadd.h vr5, vr5, vr15 |
|
vadd.h vr6, vr6, vr16 |
|
vadd.h vr7, vr7, vr17 |
|
vssrarni.bu.h vr1, vr0, 0 |
|
vssrarni.bu.h vr3, vr2, 0 |
|
vssrarni.bu.h vr5, vr4, 0 |
|
vssrarni.bu.h vr7, vr6, 0 |
|
vbsrl.v vr0, vr1, 8 |
|
vbsrl.v vr2, vr3, 8 |
|
vbsrl.v vr4, vr5, 8 |
|
vbsrl.v vr6, vr7, 8 |
|
fst.d f1, a0, 0 |
|
fstx.d f0, a0, a2 |
|
fstx.d f3, a0, t2 |
|
fstx.d f2, a0, t3 |
|
fstx.d f5, a0, t4 |
|
fstx.d f4, a0, t5 |
|
fstx.d f7, a0, t6 |
|
fstx.d f6, a0, t7 |
|
endfunc |
|
|
|
/* |
|
* #define FUNC2(a, b, c) FUNC3(a, b, c) |
|
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
|
* void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride) |
|
* LSX optimization is enough for this function. |
|
*/ |
|
function ff_h264_idct_dc_add_8_lsx |
|
vldrepl.h vr4, a1, 0 |
|
add.d t2, a2, a2 |
|
add.d t3, t2, a2 |
|
fld.s f0, a0, 0 |
|
fldx.s f1, a0, a2 |
|
fldx.s f2, a0, t2 |
|
fldx.s f3, a0, t3 |
|
st.h zero, a1, 0 |
|
|
|
vsrari.h vr4, vr4, 6 |
|
vilvl.w vr0, vr1, vr0 |
|
vilvl.w vr1, vr3, vr2 |
|
vsllwil.hu.bu vr0, vr0, 0 |
|
vsllwil.hu.bu vr1, vr1, 0 |
|
vadd.h vr0, vr0, vr4 |
|
vadd.h vr1, vr1, vr4 |
|
vssrarni.bu.h vr1, vr0, 0 |
|
|
|
vbsrl.v vr2, vr1, 4 |
|
vbsrl.v vr3, vr1, 8 |
|
vbsrl.v vr4, vr1, 12 |
|
fst.s f1, a0, 0 |
|
fstx.s f2, a0, a2 |
|
fstx.s f3, a0, t2 |
|
fstx.s f4, a0, t3 |
|
endfunc |
|
|
|
/* |
|
* #define FUNC2(a, b, c) FUNC3(a, b, c) |
|
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) |
|
* void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride) |
|
*/ |
|
function ff_h264_idct8_dc_add_8_lsx |
|
vldrepl.h vr8, a1, 0 |
|
add.d t2, a2, a2 |
|
add.d t3, t2, a2 |
|
add.d t4, t3, a2 |
|
add.d t5, t4, a2 |
|
add.d t6, t5, a2 |
|
add.d t7, t6, a2 |
|
|
|
fld.d f0, a0, 0 |
|
fldx.d f1, a0, a2 |
|
fldx.d f2, a0, t2 |
|
fldx.d f3, a0, t3 |
|
fldx.d f4, a0, t4 |
|
fldx.d f5, a0, t5 |
|
fldx.d f6, a0, t6 |
|
fldx.d f7, a0, t7 |
|
st.h zero, a1, 0 |
|
|
|
vsrari.h vr8, vr8, 6 |
|
vsllwil.hu.bu vr0, vr0, 0 |
|
vsllwil.hu.bu vr1, vr1, 0 |
|
vsllwil.hu.bu vr2, vr2, 0 |
|
vsllwil.hu.bu vr3, vr3, 0 |
|
vsllwil.hu.bu vr4, vr4, 0 |
|
vsllwil.hu.bu vr5, vr5, 0 |
|
vsllwil.hu.bu vr6, vr6, 0 |
|
vsllwil.hu.bu vr7, vr7, 0 |
|
vadd.h vr0, vr0, vr8 |
|
vadd.h vr1, vr1, vr8 |
|
vadd.h vr2, vr2, vr8 |
|
vadd.h vr3, vr3, vr8 |
|
vadd.h vr4, vr4, vr8 |
|
vadd.h vr5, vr5, vr8 |
|
vadd.h vr6, vr6, vr8 |
|
vadd.h vr7, vr7, vr8 |
|
vssrarni.bu.h vr1, vr0, 0 |
|
vssrarni.bu.h vr3, vr2, 0 |
|
vssrarni.bu.h vr5, vr4, 0 |
|
vssrarni.bu.h vr7, vr6, 0 |
|
|
|
vbsrl.v vr0, vr1, 8 |
|
vbsrl.v vr2, vr3, 8 |
|
vbsrl.v vr4, vr5, 8 |
|
vbsrl.v vr6, vr7, 8 |
|
fst.d f1, a0, 0 |
|
fstx.d f0, a0, a2 |
|
fstx.d f3, a0, t2 |
|
fstx.d f2, a0, t3 |
|
fstx.d f5, a0, t4 |
|
fstx.d f4, a0, t5 |
|
fstx.d f7, a0, t6 |
|
fstx.d f6, a0, t7 |
|
endfunc |
|
function ff_h264_idct8_dc_add_8_lasx |
|
xvldrepl.h xr8, a1, 0 |
|
add.d t2, a2, a2 |
|
add.d t3, t2, a2 |
|
add.d t4, t3, a2 |
|
add.d t5, t4, a2 |
|
add.d t6, t5, a2 |
|
add.d t7, t6, a2 |
|
|
|
fld.d f0, a0, 0 |
|
fldx.d f1, a0, a2 |
|
fldx.d f2, a0, t2 |
|
fldx.d f3, a0, t3 |
|
fldx.d f4, a0, t4 |
|
fldx.d f5, a0, t5 |
|
fldx.d f6, a0, t6 |
|
fldx.d f7, a0, t7 |
|
st.h zero, a1, 0 |
|
|
|
xvsrari.h xr8, xr8, 6 |
|
xvpermi.q xr1, xr0, 0x20 |
|
xvpermi.q xr3, xr2, 0x20 |
|
xvpermi.q xr5, xr4, 0x20 |
|
xvpermi.q xr7, xr6, 0x20 |
|
xvsllwil.hu.bu xr1, xr1, 0 |
|
xvsllwil.hu.bu xr3, xr3, 0 |
|
xvsllwil.hu.bu xr5, xr5, 0 |
|
xvsllwil.hu.bu xr7, xr7, 0 |
|
xvadd.h xr1, xr1, xr8 |
|
xvadd.h xr3, xr3, xr8 |
|
xvadd.h xr5, xr5, xr8 |
|
xvadd.h xr7, xr7, xr8 |
|
|
|
xvssrarni.bu.h xr3, xr1, 0 |
|
xvssrarni.bu.h xr7, xr5, 0 |
|
|
|
xvpermi.q xr1, xr3, 0x11 |
|
xvpermi.q xr5, xr7, 0x11 |
|
xvbsrl.v xr0, xr1, 8 |
|
xvbsrl.v xr2, xr3, 8 |
|
xvbsrl.v xr4, xr5, 8 |
|
xvbsrl.v xr6, xr7, 8 |
|
|
|
fst.d f3, a0, 0 |
|
fstx.d f1, a0, a2 |
|
fstx.d f2, a0, t2 |
|
fstx.d f0, a0, t3 |
|
fstx.d f7, a0, t4 |
|
fstx.d f5, a0, t5 |
|
fstx.d f6, a0, t6 |
|
fstx.d f4, a0, t7 |
|
endfunc |
|
|
|
/** |
|
* IDCT transforms the 16 dc values and dequantizes them. |
|
* @param qmul quantization parameter |
|
* void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul){ |
|
* LSX optimization is enough for this function. |
|
*/ |
|
function ff_h264_luma_dc_dequant_idct_8_lsx |
|
vld vr0, a1, 0 |
|
vld vr1, a1, 8 |
|
vld vr2, a1, 16 |
|
vld vr3, a1, 24 |
|
vreplgr2vr.w vr8, a2 |
|
LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10 |
|
LSX_BUTTERFLY_4_H vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1 |
|
LSX_BUTTERFLY_4_H vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5 |
|
LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10 |
|
LSX_BUTTERFLY_4_H vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5 |
|
LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 |
|
vsllwil.w.h vr0, vr0, 0 |
|
vsllwil.w.h vr1, vr1, 0 |
|
vsllwil.w.h vr2, vr2, 0 |
|
vsllwil.w.h vr3, vr3, 0 |
|
vmul.w vr0, vr0, vr8 |
|
vmul.w vr1, vr1, vr8 |
|
vmul.w vr2, vr2, vr8 |
|
vmul.w vr3, vr3, vr8 |
|
vsrarni.h.w vr1, vr0, 8 |
|
vsrarni.h.w vr3, vr2, 8 |
|
|
|
vstelm.h vr1, a0, 0, 0 |
|
vstelm.h vr1, a0, 32, 4 |
|
vstelm.h vr1, a0, 64, 1 |
|
vstelm.h vr1, a0, 96, 5 |
|
vstelm.h vr3, a0, 128, 0 |
|
vstelm.h vr3, a0, 160, 4 |
|
vstelm.h vr3, a0, 192, 1 |
|
vstelm.h vr3, a0, 224, 5 |
|
addi.d a0, a0, 256 |
|
vstelm.h vr1, a0, 0, 2 |
|
vstelm.h vr1, a0, 32, 6 |
|
vstelm.h vr1, a0, 64, 3 |
|
vstelm.h vr1, a0, 96, 7 |
|
vstelm.h vr3, a0, 128, 2 |
|
vstelm.h vr3, a0, 160, 6 |
|
vstelm.h vr3, a0, 192, 3 |
|
vstelm.h vr3, a0, 224, 7 |
|
endfunc
|
|
|