Mirror of https://github.com/FFmpeg/FFmpeg.git — file: Loongson LSX optimized h264qpel (LoongArch assembly, 1686 lines, 60 KiB).
/* |
|
* Loongson LSX optimized h264qpel |
|
* |
|
* Copyright (c) 2023 Loongson Technology Corporation Limited |
|
* Contributed by Hecai Yuan <yuanhecai@loongson.cn> |
|
* |
|
* This file is part of FFmpeg. |
|
* |
|
* FFmpeg is free software; you can redistribute it and/or |
|
* modify it under the terms of the GNU Lesser General Public |
|
* License as published by the Free Software Foundation; either |
|
* version 2.1 of the License, or (at your option) any later version. |
|
* |
|
* FFmpeg is distributed in the hope that it will be useful, |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
* Lesser General Public License for more details. |
|
* |
|
* You should have received a copy of the GNU Lesser General Public |
|
* License along with FFmpeg; if not, write to the Free Software |
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
*/ |
|
|
|
#include "loongson_asm.S" |
|
|
|
.macro VLD_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4 |
|
vld vr0, \in4, 0 |
|
vldx vr1, \in4, a2 |
|
QPEL8_H_LSX \in0, \in1 |
|
vssrani.bu.h \in0, \in2, 5 |
|
vssrani.bu.h \in1, \in3, 5 |
|
.endm |
|
|
|
/* Same as VLD_QPEL8_H_SSRANI_LSX but for the next row pair: loads the
 * rows at \in4 + 2*stride (t1) and \in4 + 3*stride (t2), filters them
 * horizontally and packs with \in2/\in3 into saturated bytes (>> 5).
 * Clobbers vr0-vr11; requires vr20/vr21/vr22 filter constants. */
.macro VLDX_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
    vldx            vr0,    \in4,    t1
    vldx            vr1,    \in4,    t2
    QPEL8_H_LSX     \in0,   \in1
    vssrani.bu.h    \in0,   \in2,    5
    vssrani.bu.h    \in1,   \in3,    5
.endm
|
|
|
/* Horizontal lowpass over four consecutive rows at \in8 (stride a2,
 * t1 = 2*stride, t2 = 3*stride).  Filter results go to \in0-\in3 and
 * are packed against the earlier 16-bit halves \in4-\in7 into
 * saturated unsigned bytes (>> 5).
 * Clobbers vr0-vr11; requires vr20/vr21/vr22 filter constants. */
.macro VLD_DOUBLE_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
    vld             vr0,    \in8,    0
    vldx            vr1,    \in8,    a2
    QPEL8_H_LSX     \in0,   \in1
    vssrani.bu.h    \in0,   \in4,    5
    vssrani.bu.h    \in1,   \in5,    5
    vldx            vr0,    \in8,    t1
    vldx            vr1,    \in8,    t2
    QPEL8_H_LSX     \in2,   \in3
    vssrani.bu.h    \in2,   \in6,    5
    vssrani.bu.h    \in3,   \in7,    5
.endm
|
|
|
/* void ff_put_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Full-pel case: plain 16x16 copy, four rows per iteration.
 * a0 = dst, a1 = src, a2 = stride. */
function ff_put_h264_qpel16_mc00_lsx
    slli.d      t0,     a2,     1       // t0 = 2 * stride
    add.d       t1,     t0,     a2      // t1 = 3 * stride
    slli.d      t2,     t0,     1       // t2 = 4 * stride
.rept 4
    vld         vr0,    a1,     0
    vldx        vr1,    a1,     a2
    vldx        vr2,    a1,     t0
    vldx        vr3,    a1,     t1
    add.d       a1,     a1,     t2
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    vstx        vr2,    a0,     t0
    vstx        vr3,    a0,     t1
    add.d       a0,     a0,     t2
.endr
endfunc
|
|
|
/* 6-tap horizontal luma lowpass for two rows.
 * In:  vr0/vr1 = rows loaded at src - 2, so element i is src[i-2].
 * Out: \out0/\out1 = 16-bit 20*(C+D) - 5*(B+E) + (A+F) + 16 per pixel
 *      (H.264 half-pel filter plus rounding; caller shifts right by 5).
 * Uses: vr20 = 20, vr21 = 5, vr22 = 16 (halfword splats).
 * Clobbers vr2-vr11. */
.macro QPEL8_H_LSX out0, out1
    // Byte-shifted copies: vrN holds src[i + (N/2 - 2)] per lane.
    vbsrl.v         vr2,    vr0,    1
    vbsrl.v         vr3,    vr1,    1
    vbsrl.v         vr4,    vr0,    2
    vbsrl.v         vr5,    vr1,    2
    vbsrl.v         vr6,    vr0,    3
    vbsrl.v         vr7,    vr1,    3
    vbsrl.v         vr8,    vr0,    4
    vbsrl.v         vr9,    vr1,    4
    vbsrl.v         vr10,   vr0,    5
    vbsrl.v         vr11,   vr1,    5

    // Pair up taps and widen: (C,D), (B,E), (A,F).
    vilvl.b         vr6,    vr4,    vr6
    vilvl.b         vr7,    vr5,    vr7
    vilvl.b         vr8,    vr2,    vr8
    vilvl.b         vr9,    vr3,    vr9
    vilvl.b         vr10,   vr0,    vr10
    vilvl.b         vr11,   vr1,    vr11
    vhaddw.hu.bu    vr6,    vr6,    vr6     // C + D
    vhaddw.hu.bu    vr7,    vr7,    vr7
    vhaddw.hu.bu    vr8,    vr8,    vr8     // B + E
    vhaddw.hu.bu    vr9,    vr9,    vr9
    vhaddw.hu.bu    vr10,   vr10,   vr10    // A + F
    vhaddw.hu.bu    vr11,   vr11,   vr11
    vmul.h          vr2,    vr6,    vr20    // 20 * (C + D)
    vmul.h          vr3,    vr7,    vr20
    vmul.h          vr4,    vr8,    vr21    // 5 * (B + E)
    vmul.h          vr5,    vr9,    vr21
    vssub.h         vr2,    vr2,    vr4
    vssub.h         vr3,    vr3,    vr5
    vsadd.h         vr2,    vr2,    vr10
    vsadd.h         vr3,    vr3,    vr11
    vsadd.h         \out0,  vr2,    vr22    // + 16 rounding
    vsadd.h         \out1,  vr3,    vr22
.endm
|
|
|
/* Horizontal lowpass over four consecutive rows at \in4 (stride a2,
 * t1 = 2*stride, t2 = 3*stride) keeping the 16-bit intermediate
 * results in \in0-\in3 (no narrowing here).
 * Clobbers vr0-vr11; requires vr20/vr21/vr22 filter constants. */
.macro VLD_DOUBLE_QPEL8_H_LSX in0, in1, in2, in3, in4
    vld             vr0,    \in4,    0
    vldx            vr1,    \in4,    a2
    QPEL8_H_LSX     \in0,   \in1
    vldx            vr0,    \in4,    t1
    vldx            vr1,    \in4,    t2
    QPEL8_H_LSX     \in2,   \in3
.endm
|
|
|
/* Template for ff_put_h264_qpel16_mc10_lsx / mc30_lsx:
 * 16x16 horizontal half-pel filter averaged with the nearest full pel.
 * a0 = dst, a1 = src, a2 = stride (t1 = 2*stride, t2 = 3*stride).
 * mc10 averages with src (t8 = src), mc30 with src + 1 (t8 = src + 1). */
.macro put_h264_qpel16 in0
function ff_put_h264_qpel16_mc\in0\()_lsx
.ifc \in0, 10
    addi.d      t8,     a1,     0
.else
    addi.d      t8,     a1,     1
.endif
    vldi        vr20,   0x414           // halfwords = 20
    vldi        vr21,   0x405           // halfwords = 5
    vldi        vr22,   0x410           // halfwords = 16 (rounding)
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    addi.d      t0,     a1,     -2      // t0 = src - 2 (left 16 pixels)
    addi.d      a1,     t0,     8       // a1 = t0 + 8 (right 8 pixels)
.rept 4
    // Left/right halves filtered, packed, then averaged with full pel.
    VLD_DOUBLE_QPEL8_H_LSX      vr12, vr13, vr14, vr15, t0
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr12, vr13, a1
    vld         vr10,   t8,     0
    vldx        vr11,   t8,     a2
    vavgr.bu    vr0,    vr2,    vr10
    vavgr.bu    vr1,    vr3,    vr11
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    VLDX_QPEL8_H_SSRANI_LSX     vr4, vr5, vr14, vr15, a1
    vldx        vr12,   t8,     t1
    vldx        vr13,   t8,     t2
    vavgr.bu    vr2,    vr4,    vr12
    vavgr.bu    vr3,    vr5,    vr13
    vstx        vr2,    a0,     t1
    vstx        vr3,    a0,     t2
    // Advance all pointers by 4 rows.
    alsl.d      a0,     a2,     a0,     2
    alsl.d      t8,     a2,     t8,     2
    alsl.d      a1,     a2,     a1,     2
    alsl.d      t0,     a2,     t0,     2
.endr
endfunc
.endm
|
|
|
// Instantiate the quarter-pel horizontal "put" variants:
// mc10 averages with the x=0 pel, mc30 with the x=1 pel.
put_h264_qpel16 10
put_h264_qpel16 30
|
|
|
/* void ff_put_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Pure horizontal half-pel filter (no averaging), 16x16, 4 rows/iter.
 * a0 = dst, a1 = src, a2 = stride. */
function ff_put_h264_qpel16_mc20_lsx
    vldi        vr20,   0x414           // halfwords = 20
    vldi        vr21,   0x405           // halfwords = 5
    vldi        vr22,   0x410           // halfwords = 16 (rounding)
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    addi.d      t0,     a1,     -2      // t0 = src - 2
    addi.d      a1,     t0,     8       // a1 = t0 + 8
.rept 4
    VLD_DOUBLE_QPEL8_H_LSX      vr12, vr13, vr14, vr15, t0
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr12, vr13, a1
    vst         vr2,    a0,     0
    vstx        vr3,    a0,     a2
    VLDX_QPEL8_H_SSRANI_LSX     vr4, vr5, vr14, vr15, a1
    vstx        vr4,    a0,     t1
    vstx        vr5,    a0,     t2
    alsl.d      a0,     a2,     a0,     2
    alsl.d      a1,     a2,     a1,     2
    alsl.d      t0,     a2,     t0,     2
.endr
endfunc
|
|
|
/* 6-tap vertical luma lowpass producing two output rows from seven
 * input rows \in0..\in6 (16 bytes wide; low half then high half).
 *   row A = 20*(\in2+\in3) - 5*(\in1+\in4) + (\in0+\in5) + 16
 *   row B = 20*(\in3+\in4) - 5*(\in2+\in5) + (\in1+\in6) + 16
 * Out: vr13 = row A, vr14 = row B, packed to saturated bytes (>> 5).
 * Uses: vr20 = 20, vr21 = 5, vr22 = 16.  Clobbers vr7-vr18. */
.macro QPEL8_V_LSX in0, in1, in2, in3, in4, in5, in6
    // Low 8 bytes of both rows.
    vilvl.b         vr7,    \in3,   \in2
    vilvl.b         vr8,    \in4,   \in3
    vilvl.b         vr9,    \in4,   \in1
    vilvl.b         vr10,   \in5,   \in2
    vilvl.b         vr11,   \in5,   \in0
    vilvl.b         vr12,   \in6,   \in1
    vhaddw.hu.bu    vr7,    vr7,    vr7
    vhaddw.hu.bu    vr8,    vr8,    vr8
    vhaddw.hu.bu    vr9,    vr9,    vr9
    vhaddw.hu.bu    vr10,   vr10,   vr10
    vhaddw.hu.bu    vr11,   vr11,   vr11
    vhaddw.hu.bu    vr12,   vr12,   vr12
    vmul.h          vr7,    vr7,    vr20
    vmul.h          vr8,    vr8,    vr20
    vmul.h          vr9,    vr9,    vr21
    vmul.h          vr10,   vr10,   vr21
    vssub.h         vr7,    vr7,    vr9
    vssub.h         vr8,    vr8,    vr10
    vsadd.h         vr7,    vr7,    vr11
    vsadd.h         vr8,    vr8,    vr12
    vsadd.h         vr7,    vr7,    vr22
    vsadd.h         vr8,    vr8,    vr22

    // High 8 bytes of both rows.
    vilvh.b         vr13,   \in3,   \in2
    vilvh.b         vr14,   \in4,   \in3
    vilvh.b         vr15,   \in4,   \in1
    vilvh.b         vr16,   \in5,   \in2
    vilvh.b         vr17,   \in5,   \in0
    vilvh.b         vr18,   \in6,   \in1
    vhaddw.hu.bu    vr13,   vr13,   vr13
    vhaddw.hu.bu    vr14,   vr14,   vr14
    vhaddw.hu.bu    vr15,   vr15,   vr15
    vhaddw.hu.bu    vr16,   vr16,   vr16
    vhaddw.hu.bu    vr17,   vr17,   vr17
    vhaddw.hu.bu    vr18,   vr18,   vr18
    vmul.h          vr13,   vr13,   vr20
    vmul.h          vr14,   vr14,   vr20
    vmul.h          vr15,   vr15,   vr21
    vmul.h          vr16,   vr16,   vr21
    vssub.h         vr13,   vr13,   vr15
    vssub.h         vr14,   vr14,   vr16
    vsadd.h         vr13,   vr13,   vr17
    vsadd.h         vr14,   vr14,   vr18
    vsadd.h         vr13,   vr13,   vr22
    vsadd.h         vr14,   vr14,   vr22
    vssrani.bu.h    vr13,   vr7,    5       // pack row A
    vssrani.bu.h    vr14,   vr8,    5       // pack row B
.endm
|
|
|
/* Template for ff_put_h264_qpel16_mc01_lsx / mc03_lsx:
 * 16x16 vertical half-pel filter averaged with the nearest full pel.
 * a0 = dst, a1 = src, a2 = stride (t0 = 2*stride, t1 = 3*stride).
 * mc01 averages each filtered row with the row above it; mc03 with
 * the row below (input registers rotate through the unrolled body).
 * Processes 2 rows per QPEL8_V_LSX call, fully unrolled for 16 rows. */
.macro put_h264_qpel16_mc1 in0
function ff_put_h264_qpel16_mc\in0\()_lsx
    slli.d      t0,     a2,     1
    add.d       t1,     t0,     a2
    sub.d       t2,     a1,     t0      // t2 = src - 2 * stride
    vldi        vr20,   0x414           // halfwords = 20
    vldi        vr21,   0x405           // halfwords = 5
    vldi        vr22,   0x410           // halfwords = 16 (rounding)

    // Prime the 7-row sliding window.
    vld         vr0,    t2,     0
    vldx        vr1,    t2,     a2
    vldx        vr2,    t2,     t0
    vldx        vr3,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr4,    t2,     0
    vldx        vr5,    t2,     a2
    vldx        vr6,    t2,     t0
    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
.ifc \in0, 01
    vavgr.bu    vr13,   vr2,    vr13
    vavgr.bu    vr14,   vr3,    vr14
.else
    vavgr.bu    vr13,   vr3,    vr13
    vavgr.bu    vr14,   vr4,    vr14
.endif
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2

    vldx        vr0,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr1,    t2,     0
    QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
.ifc \in0, 01
    vavgr.bu    vr13,   vr4,    vr13
    vavgr.bu    vr14,   vr5,    vr14
.else
    vavgr.bu    vr13,   vr5,    vr13
    vavgr.bu    vr14,   vr6,    vr14
.endif
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1

    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride

    vldx        vr2,    t2,     a2
    vldx        vr3,    t2,     t0
    QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
.ifc \in0, 01
    vavgr.bu    vr13,   vr6,    vr13
    vavgr.bu    vr14,   vr0,    vr14
.else
    vavgr.bu    vr13,   vr0,    vr13
    vavgr.bu    vr14,   vr1,    vr14
.endif
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2

    vldx        vr4,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr5,    t2,     0
    QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
.ifc \in0, 01
    vavgr.bu    vr13,   vr1,    vr13
    vavgr.bu    vr14,   vr2,    vr14
.else
    vavgr.bu    vr13,   vr2,    vr13
    vavgr.bu    vr14,   vr3,    vr14
.endif
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1

    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride

    vldx        vr6,    t2,     a2
    vldx        vr0,    t2,     t0
    QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
.ifc \in0, 01
    vavgr.bu    vr13,   vr3,    vr13
    vavgr.bu    vr14,   vr4,    vr14
.else
    vavgr.bu    vr13,   vr4,    vr13
    vavgr.bu    vr14,   vr5,    vr14
.endif
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2

    vldx        vr1,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr2,    t2,     0
    QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
.ifc \in0, 01
    vavgr.bu    vr13,   vr5,    vr13
    vavgr.bu    vr14,   vr6,    vr14
.else
    vavgr.bu    vr13,   vr6,    vr13
    vavgr.bu    vr14,   vr0,    vr14
.endif
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1

    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride

    vldx        vr3,    t2,     a2
    vldx        vr4,    t2,     t0
    QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
.ifc \in0, 01
    vavgr.bu    vr13,   vr0,    vr13
    vavgr.bu    vr14,   vr1,    vr14
.else
    vavgr.bu    vr13,   vr1,    vr13
    vavgr.bu    vr14,   vr2,    vr14
.endif
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2

    vldx        vr5,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr6,    t2,     0
    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
.ifc \in0, 01
    vavgr.bu    vr13,   vr2,    vr13
    vavgr.bu    vr14,   vr3,    vr14
.else
    vavgr.bu    vr13,   vr3,    vr13
    vavgr.bu    vr14,   vr4,    vr14
.endif
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1
endfunc
.endm
|
|
|
// Instantiate the quarter-pel vertical "put" variants (y = 1/4 and 3/4).
put_h264_qpel16_mc1 01
put_h264_qpel16_mc1 03
|
|
|
/* Vertical lowpass over rows \in0..\in6, average the two packed output
 * rows (vr13/vr14) with \in7/\in8 (e.g. horizontal-filter results),
 * and store at dst (a0) and dst + stride (a2). */
.macro VST_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
    QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
    vavgr.bu    vr13,   \in7,   vr13
    vavgr.bu    vr14,   \in8,   vr14
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
.endm
|
|
|
/* Same as VST_QPEL8_V_LOWPASS_LSX but stores at dst + 2*stride (t1)
 * and dst + 3*stride (t2). */
.macro VSTX_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
    QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
    vavgr.bu    vr13,   \in7,   vr13
    vavgr.bu    vr14,   \in8,   vr14
    vstx        vr13,   a0,     t1
    vstx        vr14,   a0,     t2
.endm
|
|
|
/* void ff_put_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Quarter-pel (1,1): average of the horizontal half-pel filter
 * (from src) and the vertical half-pel filter (from src), 16x16.
 * Saves/restores f24-f31 since vr24-vr30 hold packed H results.
 * a0 = dst, a1 = src (repurposed), a2 = stride;
 * t0 = H-filter src (src-2), a1 = t0+8, t4 = V-filter src - 2*stride. */
function ff_put_h264_qpel16_mc11_lsx
    addi.d      sp,     sp,     -64
    fst.d       f24,    sp,     0
    fst.d       f25,    sp,     8
    fst.d       f26,    sp,     16
    fst.d       f27,    sp,     24
    fst.d       f28,    sp,     32
    fst.d       f29,    sp,     40
    fst.d       f30,    sp,     48
    fst.d       f31,    sp,     56
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    slli.d      t6,     t1,     1       // t6 = 4 * stride
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410
    sub.d       t4,     a1,     t1      // t4 = src - 2 * stride
    addi.d      t0,     a1,     -2      // t0 = src - 2
    addi.d      a1,     t0,     8       // a1 = t0 + 8
.rept 2
    // 8 rows of horizontal filter, packed into vr23-vr30.
    VLD_DOUBLE_QPEL8_H_LSX          vr12, vr13, vr14, vr15, t0
    alsl.d      t0,     a2,     t0,     2
    VLD_DOUBLE_QPEL8_H_LSX          vr16, vr17, vr18, vr19, t0
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr23, vr24, vr25, vr26, vr12, vr13, \
                                    vr14, vr15, a1
    alsl.d      a1,     a2,     a1,     2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr27, vr28, vr29, vr30, vr16, vr17, \
                                    vr18, vr19, a1

    // Vertical filter over the same 8 output rows, averaged with H.
    vld         vr0,    t4,     0       // t4 = src - 2 * stride
    vldx        vr1,    t4,     a2
    vldx        vr2,    t4,     t1
    vldx        vr3,    t4,     t2
    alsl.d      t4,     a2,     t4,     2   // src + 2 * stride
    vld         vr4,    t4,     0
    vldx        vr5,    t4,     a2
    vldx        vr6,    t4,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx        vr0,    t4,     t2
    alsl.d      t4,     a2,     t4,     2   // src + 6 * stride
    vld         vr1,    t4,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    vldx        vr2,    t4,     a2
    vldx        vr3,    t4,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx        vr4,    t4,     t2
    alsl.d      t4,     a2,     t4,     2   // src + 10 * stride
    vld         vr5,    t4,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
    // Advance to the next 8-row band.
    alsl.d      t0,     a2,     t0,     2
    alsl.d      a1,     a2,     a1,     2   // a1 = src + 8 * stride
    alsl.d      a0,     a2,     a0,     2   // dst += 8 * stride total
    sub.d       t4,     t4,     t6          // rewind V src by 4 rows
.endr
    fld.d       f24,    sp,     0
    fld.d       f25,    sp,     8
    fld.d       f26,    sp,     16
    fld.d       f27,    sp,     24
    fld.d       f28,    sp,     32
    fld.d       f29,    sp,     40
    fld.d       f30,    sp,     48
    fld.d       f31,    sp,     56
    addi.d      sp,     sp,     64
endfunc
|
|
|
/* void ff_avg_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Full-pel "avg": dst = rounded average of dst and src, 16x16.
 * a0 = dst (write), t3 = dst (read), a1 = src, a2 = stride. */
function ff_avg_h264_qpel16_mc00_lsx
    slli.d      t0,     a2,     1       // 2 * stride
    add.d       t1,     t0,     a2      // 3 * stride
    slli.d      t2,     t0,     1       // 4 * stride
    addi.d      t3,     a0,     0       // read pointer into dst
.rept 4
    vld         vr0,    a1,     0
    vldx        vr1,    a1,     a2
    vldx        vr2,    a1,     t0
    vldx        vr3,    a1,     t1
    add.d       a1,     a1,     t2
    vld         vr8,    t3,     0
    vldx        vr9,    t3,     a2
    vldx        vr10,   t3,     t0
    vldx        vr11,   t3,     t1
    add.d       t3,     t3,     t2
    vavgr.bu    vr0,    vr8,    vr0
    vavgr.bu    vr1,    vr9,    vr1
    vavgr.bu    vr2,    vr10,   vr2
    vavgr.bu    vr3,    vr11,   vr3
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    vstx        vr2,    a0,     t0
    vstx        vr3,    a0,     t1
    add.d       a0,     a0,     t2
.endr
endfunc
|
|
|
/* Template for ff_put_h264_qpel16_mc33_lsx / mc31_lsx:
 * 16x16 average of a horizontal half-pel filter and a vertical
 * half-pel filter taken at offset (+1, y).
 * mc33 shifts the H-filter source down one row (t0 += stride).
 * a0 = dst, a1 = src, a2 = stride; t0 = H src - 2,
 * t4 = V src - 2*stride + 1.  Saves f24-f31 (vr24-vr30 used). */
.macro put_h264_qpel16_mc in0
function ff_put_h264_qpel16_mc\in0\()_lsx
    addi.d      sp,     sp,     -64
    fst.d       f24,    sp,     0
    fst.d       f25,    sp,     8
    fst.d       f26,    sp,     16
    fst.d       f27,    sp,     24
    fst.d       f28,    sp,     32
    fst.d       f29,    sp,     40
    fst.d       f30,    sp,     48
    fst.d       f31,    sp,     56
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410
    addi.d      t0,     a1,     -2      // t0 = src - 2

.ifc \in0, 33
    add.d       t0,     t0,     a2      // mc33: H filter one row lower
.endif
    add.d       t3,     a1,     zero    // t3 = src
    sub.d       t4,     a1,     t1      // t4 = src - 2 * stride
    addi.d      t4,     t4,     1       // ... + 1 (x = 1 column)

    // First 8 rows: horizontal filter packed into vr23-vr30.
    VLD_DOUBLE_QPEL8_H_LSX          vr12, vr13, vr14, vr15, t0
    alsl.d      a1,     a2,     t0,     2
    VLD_DOUBLE_QPEL8_H_LSX          vr16, vr17, vr18, vr19, a1
    addi.d      a1,     t0,     8
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr23, vr24, vr25, vr26, vr12, vr13, \
                                    vr14, vr15, a1
    alsl.d      a1,     a2,     a1,     2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr27, vr28, vr29, vr30, vr16, vr17, \
                                    vr18, vr19, a1
    // Vertical filter, averaged with the packed H rows.
    vld         vr0,    t4,     0       // t4 = src - 2 * stride + 1
    vldx        vr1,    t4,     a2
    vldx        vr2,    t4,     t1
    vldx        vr3,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr4,    t4,     0
    vldx        vr5,    t4,     a2
    vldx        vr6,    t4,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx        vr0,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr1,    t4,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    add.d       t6,     t4,     zero    // t6 = src + 6 * stride (+1)
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    vldx        vr2,    t4,     a2
    vldx        vr3,    t4,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx        vr4,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr5,    t4,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30

    // Second 8 rows: recompute H bank at src + 8 * stride.
    alsl.d      a1,     a2,     t0,     3   // a1 = src + 8 * stride
    addi.d      t5,     a1,     8           // t5 = a1 + 8
    VLD_DOUBLE_QPEL8_H_LSX          vr12, vr13, vr14, vr15, a1
    alsl.d      a1,     a2,     a1,     2
    VLD_DOUBLE_QPEL8_H_LSX          vr16, vr17, vr18, vr19, a1
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr23, vr24, vr25, vr26, vr12, vr13, \
                                    vr14, vr15, t5
    alsl.d      t5,     a2,     t5,     2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr27, vr28, vr29, vr30, vr16, vr17, \
                                    vr18, vr19, t5
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride

    // Vertical filter for rows 8-15, continuing from t6.
    vld         vr0,    t6,     0
    vldx        vr1,    t6,     a2
    vldx        vr2,    t6,     t1
    vldx        vr3,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr4,    t6,     0
    vldx        vr5,    t6,     a2
    vldx        vr6,    t6,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx        vr0,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr1,    t6,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    vldx        vr2,    t6,     a2
    vldx        vr3,    t6,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx        vr4,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr5,    t6,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30

    fld.d       f24,    sp,     0
    fld.d       f25,    sp,     8
    fld.d       f26,    sp,     16
    fld.d       f27,    sp,     24
    fld.d       f28,    sp,     32
    fld.d       f29,    sp,     40
    fld.d       f30,    sp,     48
    fld.d       f31,    sp,     56
    addi.d      sp,     sp,     64
endfunc
.endm
|
|
|
// Instantiate the (1, y) quarter-pel diagonal "put" variants.
put_h264_qpel16_mc 33
put_h264_qpel16_mc 31
|
|
|
/* void ff_put_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Quarter-pel (1,3): average of the horizontal half-pel filter taken
 * one row below (t0 = src + stride - 2) and the vertical half-pel
 * filter at x = 0 (t4 = src - 2*stride).  Structure mirrors the
 * put_h264_qpel16_mc template above.  Saves f24-f31. */
function ff_put_h264_qpel16_mc13_lsx
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410
    addi.d      sp,     sp,     -64
    fst.d       f24,    sp,     0
    fst.d       f25,    sp,     8
    fst.d       f26,    sp,     16
    fst.d       f27,    sp,     24
    fst.d       f28,    sp,     32
    fst.d       f29,    sp,     40
    fst.d       f30,    sp,     48
    fst.d       f31,    sp,     56
    addi.d      t0,     a1,     -2      // t0 = src - 2
    add.d       t0,     t0,     a2      // ... + stride (y = 1 row)
    add.d       t3,     a1,     zero    // t3 = src
    sub.d       t4,     a1,     t1      // t4 = src - 2 * stride

    // First 8 rows: horizontal filter bank in vr23-vr30.
    VLD_DOUBLE_QPEL8_H_LSX          vr12, vr13, vr14, vr15, t0
    alsl.d      a1,     a2,     t0,     2
    VLD_DOUBLE_QPEL8_H_LSX          vr16, vr17, vr18, vr19, a1
    addi.d      a1,     t0,     8
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr23, vr24, vr25, vr26, vr12, vr13, \
                                    vr14, vr15, a1
    alsl.d      a1,     a2,     a1,     2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr27, vr28, vr29, vr30, vr16, vr17, \
                                    vr18, vr19, a1
    // Vertical filter averaged with the H bank.
    vld         vr0,    t4,     0       // t4 = src - 2 * stride
    vldx        vr1,    t4,     a2
    vldx        vr2,    t4,     t1
    vldx        vr3,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr4,    t4,     0
    vldx        vr5,    t4,     a2
    vldx        vr6,    t4,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx        vr0,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr1,    t4,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    add.d       t6,     t4,     zero    // t6 = src + 6 * stride
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    vldx        vr2,    t4,     a2
    vldx        vr3,    t4,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx        vr4,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr5,    t4,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30

    // Second 8 rows: rebuild H bank at src + 8 * stride.
    alsl.d      a1,     a2,     t0,     3   // a1 = src + 8 * stride
    addi.d      t5,     a1,     8           // t5 = a1 + 8
    VLD_DOUBLE_QPEL8_H_LSX          vr12, vr13, vr14, vr15, a1
    alsl.d      a1,     a2,     a1,     2
    VLD_DOUBLE_QPEL8_H_LSX          vr16, vr17, vr18, vr19, a1
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr23, vr24, vr25, vr26, vr12, vr13, \
                                    vr14, vr15, t5
    alsl.d      t5,     a2,     t5,     2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr27, vr28, vr29, vr30, vr16, vr17, \
                                    vr18, vr19, t5
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride

    vld         vr0,    t6,     0       // t6 = src + 6 * stride
    vldx        vr1,    t6,     a2
    vldx        vr2,    t6,     t1
    vldx        vr3,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr4,    t6,     0
    vldx        vr5,    t6,     a2
    vldx        vr6,    t6,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
    vldx        vr0,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr1,    t6,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    vldx        vr2,    t6,     a2
    vldx        vr3,    t6,     t1
    VST_QPEL8_V_LOWPASS_LSX     vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
    vldx        vr4,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr5,    t6,     0
    VSTX_QPEL8_V_LOWPASS_LSX    vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
    fld.d       f24,    sp,     0
    fld.d       f25,    sp,     8
    fld.d       f26,    sp,     16
    fld.d       f27,    sp,     24
    fld.d       f28,    sp,     32
    fld.d       f29,    sp,     40
    fld.d       f30,    sp,     48
    fld.d       f31,    sp,     56
    addi.d      sp,     sp,     64
endfunc
|
|
|
/* void ff_avg_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Quarter-pel (1/4, 0) "avg": H half-pel filter averaged with the
 * x = 0 full pel (a1), then averaged again with dst (t0).
 * a0 = dst (write), t0 = dst (read), t4 = src - 2, t5 = t4 + 8,
 * t1 = 2*stride, t2 = 3*stride.  8 rows per .rept iteration. */
function ff_avg_h264_qpel16_mc10_lsx
    addi.d      t0,     a0,     0       // t0 = dst (read pointer)
    addi.d      t4,     a1,     -2      // t4 = src - 2
    addi.d      t5,     t4,     8       // right 8 columns
    slli.d      t1,     a2,     1
    add.d       t2,     a2,     t1
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410
.rept 2
    VLD_DOUBLE_QPEL8_H_LSX      vr12, vr13, vr14, vr15, t4
    alsl.d      t4,     a2,     t4,     2
    VLD_DOUBLE_QPEL8_H_LSX      vr16, vr17, vr18, vr19, t4
    // Rows 0-1.
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr12, vr13, t5
    vld         vr0,    a1,     0
    vldx        vr1,    a1,     a2
    vld         vr12,   t0,     0
    vldx        vr13,   t0,     a2
    vavgr.bu    vr0,    vr0,    vr2     // filtered + full pel
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12    // + dst (avg variant)
    vavgr.bu    vr1,    vr1,    vr13
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    // Rows 2-3.
    VLDX_QPEL8_H_SSRANI_LSX     vr2, vr3, vr14, vr15, t5
    vldx        vr0,    a1,     t1
    vldx        vr1,    a1,     t2
    vldx        vr12,   t0,     t1
    vldx        vr13,   t0,     t2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12
    vavgr.bu    vr1,    vr1,    vr13
    vstx        vr0,    a0,     t1
    vstx        vr1,    a0,     t2
    alsl.d      t5,     a2,     t5,     2
    alsl.d      a1,     a2,     a1,     2
    alsl.d      t0,     a2,     t0,     2
    alsl.d      a0,     a2,     a0,     2
    // Rows 4-5.
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr16, vr17, t5
    vld         vr0,    a1,     0
    vldx        vr1,    a1,     a2
    vld         vr12,   t0,     0
    vldx        vr13,   t0,     a2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12
    vavgr.bu    vr1,    vr1,    vr13
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    // Rows 6-7.
    VLDX_QPEL8_H_SSRANI_LSX     vr2, vr3, vr18, vr19, t5
    vldx        vr0,    a1,     t1
    vldx        vr1,    a1,     t2
    vldx        vr12,   t0,     t1
    vldx        vr13,   t0,     t2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12
    vavgr.bu    vr1,    vr1,    vr13
    vstx        vr0,    a0,     t1
    vstx        vr1,    a0,     t2
    alsl.d      t5,     a2,     t5,     2
    alsl.d      a1,     a2,     a1,     2
    alsl.d      t0,     a2,     t0,     2
    alsl.d      a0,     a2,     a0,     2
    alsl.d      t4,     a2,     t4,     2   // t4 = src + 8 * stride - 2
.endr
endfunc
|
|
|
/* void ff_avg_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Quarter-pel (3/4, 0) "avg": identical to mc10 except the full-pel
 * average uses src + 1 (a1 is pre-incremented).
 * a0 = dst (write), t0 = dst (read), t4 = src - 2, t5 = t4 + 8. */
function ff_avg_h264_qpel16_mc30_lsx
    addi.d      t0,     a0,     0       // t0 = dst (read pointer)
    addi.d      t4,     a1,     -2      // t4 = src - 2
    addi.d      t5,     t4,     8       // right 8 columns
    addi.d      a1,     a1,     1       // a1 = src + 1 (x = 3/4 pel)
    slli.d      t1,     a2,     1
    add.d       t2,     a2,     t1
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410
.rept 2
    VLD_DOUBLE_QPEL8_H_LSX      vr12, vr13, vr14, vr15, t4
    alsl.d      t4,     a2,     t4,     2
    VLD_DOUBLE_QPEL8_H_LSX      vr16, vr17, vr18, vr19, t4
    // Rows 0-1.
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr12, vr13, t5
    vld         vr0,    a1,     0
    vldx        vr1,    a1,     a2
    vld         vr12,   t0,     0
    vldx        vr13,   t0,     a2
    vavgr.bu    vr0,    vr0,    vr2     // filtered + full pel
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12    // + dst (avg variant)
    vavgr.bu    vr1,    vr1,    vr13
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    // Rows 2-3.
    VLDX_QPEL8_H_SSRANI_LSX     vr2, vr3, vr14, vr15, t5
    vldx        vr0,    a1,     t1
    vldx        vr1,    a1,     t2
    vldx        vr12,   t0,     t1
    vldx        vr13,   t0,     t2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12
    vavgr.bu    vr1,    vr1,    vr13
    vstx        vr0,    a0,     t1
    vstx        vr1,    a0,     t2
    alsl.d      t5,     a2,     t5,     2
    alsl.d      a1,     a2,     a1,     2
    alsl.d      t0,     a2,     t0,     2
    alsl.d      a0,     a2,     a0,     2
    // Rows 4-5.
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr16, vr17, t5
    vld         vr0,    a1,     0
    vldx        vr1,    a1,     a2
    vld         vr12,   t0,     0
    vldx        vr13,   t0,     a2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12
    vavgr.bu    vr1,    vr1,    vr13
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    // Rows 6-7.
    VLDX_QPEL8_H_SSRANI_LSX     vr2, vr3, vr18, vr19, t5
    vldx        vr0,    a1,     t1
    vldx        vr1,    a1,     t2
    vldx        vr12,   t0,     t1
    vldx        vr13,   t0,     t2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vavgr.bu    vr0,    vr0,    vr12
    vavgr.bu    vr1,    vr1,    vr13
    vstx        vr0,    a0,     t1
    vstx        vr1,    a0,     t2
    alsl.d      t5,     a2,     t5,     2
    alsl.d      a1,     a2,     a1,     2
    alsl.d      t0,     a2,     t0,     2
    alsl.d      a0,     a2,     a0,     2
    alsl.d      t4,     a2,     t4,     2   // t4 = src + 8 * stride - 2
.endr
endfunc
|
|
|
/* void ff_put_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Pure vertical half-pel filter, 16x16, fully unrolled; the 7-row
 * input window rotates through vr0-vr6 between QPEL8_V_LSX calls
 * (which leave the packed output rows in vr13/vr14).
 * a0 = dst, a1 = src, a2 = stride, t0 = 2*stride, t1 = 3*stride. */
function ff_put_h264_qpel16_mc02_lsx
    slli.d      t0,     a2,     1
    add.d       t1,     t0,     a2
    sub.d       t2,     a1,     t0      // t2 = src - 2 * stride
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410

    vld         vr0,    t2,     0
    vldx        vr1,    t2,     a2
    vldx        vr2,    t2,     t0
    vldx        vr3,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr4,    t2,     0
    vldx        vr5,    t2,     a2
    vldx        vr6,    t2,     t0
    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr0,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr1,    t2,     0
    QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    vldx        vr2,    t2,     a2
    vldx        vr3,    t2,     t0
    QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr4,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr5,    t2,     0
    QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1

    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride

    vldx        vr6,    t2,     a2
    vldx        vr0,    t2,     t0
    QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr1,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr2,    t2,     0
    QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    vldx        vr3,    t2,     a2
    vldx        vr4,    t2,     t0
    QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr5,    t2,     t1
    alsl.d      t2,     a2,     t2,     2   // t2 += 4 * stride
    vld         vr6,    t2,     0
    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
    vstx        vr13,   a0,     t0
    vstx        vr14,   a0,     t1
endfunc
|
|
|
/* Shared body for the "avg" diagonal quarter-pel 16x16 functions
 * (mc11/mc13/mc31/mc33): average of the H half-pel filter (from t0)
 * and the V half-pel filter (from t4), then averaged with dst (t8).
 * Caller must set before expansion:
 *   a0 = dst, a2 = stride, t1 = 2*stride, t2 = 3*stride,
 *   t0 = H-filter source (src - 2, possibly + stride),
 *   t4 = V-filter source (src - 2*stride, possibly + 1),
 *   t8 = dst read pointer.
 * Saves/restores f24-f31 (vr23-vr30 hold the packed H rows). */
.macro avc_luma_hv_qrt_and_aver_dst_16x16_lsx
    addi.d      sp,     sp,     -64
    fst.d       f24,    sp,     0
    fst.d       f25,    sp,     8
    fst.d       f26,    sp,     16
    fst.d       f27,    sp,     24
    fst.d       f28,    sp,     32
    fst.d       f29,    sp,     40
    fst.d       f30,    sp,     48
    fst.d       f31,    sp,     56
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410

    // First 8 rows: horizontal filter bank packed into vr23-vr30.
    VLD_DOUBLE_QPEL8_H_LSX          vr12, vr13, vr14, vr15, t0
    alsl.d      a1,     a2,     t0,     2
    VLD_DOUBLE_QPEL8_H_LSX          vr16, vr17, vr18, vr19, a1
    addi.d      a1,     t0,     8
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr23, vr24, vr25, vr26, vr12, vr13, \
                                    vr14, vr15, a1
    alsl.d      a1,     a2,     a1,     2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr27, vr28, vr29, vr30, vr16, vr17, \
                                    vr18, vr19, a1
    // Vertical filter; each output pair is averaged with H, then dst.
    vld         vr0,    t4,     0       // t4 = V source (src - 2*stride)
    vldx        vr1,    t4,     a2
    vldx        vr2,    t4,     t1
    vldx        vr3,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr4,    t4,     0
    vldx        vr5,    t4,     a2
    vldx        vr6,    t4,     t1
    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
    vld         vr0,    t8,     0
    vldx        vr1,    t8,     a2
    vavgr.bu    vr13,   vr23,   vr13    // (H + V) / 2
    vavgr.bu    vr14,   vr24,   vr14
    vavgr.bu    vr13,   vr13,   vr0     // ... averaged with dst
    vavgr.bu    vr14,   vr14,   vr1
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr0,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr1,    t4,     0
    QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
    vldx        vr2,    t8,     t1
    vldx        vr3,    t8,     t2
    vavgr.bu    vr13,   vr25,   vr13
    vavgr.bu    vr14,   vr26,   vr14
    vavgr.bu    vr13,   vr13,   vr2
    vavgr.bu    vr14,   vr14,   vr3
    add.d       t6,     t4,     zero    // t6 = src + 6 * stride
    vstx        vr13,   a0,     t1
    vstx        vr14,   a0,     t2
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    alsl.d      t8,     a2,     t8,     2
    vldx        vr2,    t4,     a2
    vldx        vr3,    t4,     t1
    QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
    vld         vr4,    t8,     0
    vldx        vr5,    t8,     a2
    vavgr.bu    vr13,   vr27,   vr13
    vavgr.bu    vr14,   vr28,   vr14
    vavgr.bu    vr13,   vr13,   vr4
    vavgr.bu    vr14,   vr14,   vr5
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr4,    t4,     t2
    alsl.d      t4,     a2,     t4,     2
    vld         vr5,    t4,     0
    QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
    vldx        vr6,    t8,     t1
    vldx        vr0,    t8,     t2
    vavgr.bu    vr13,   vr29,   vr13
    vavgr.bu    vr14,   vr30,   vr14
    vavgr.bu    vr13,   vr13,   vr6
    vavgr.bu    vr14,   vr14,   vr0
    vstx        vr13,   a0,     t1
    vstx        vr14,   a0,     t2

    // Second 8 rows: rebuild the H bank at src + 8 * stride.
    alsl.d      a1,     a2,     t0,     3   // a1 = src + 8 * stride
    addi.d      t5,     a1,     8           // t5 = a1 + 8
    VLD_DOUBLE_QPEL8_H_LSX          vr12, vr13, vr14, vr15, a1
    alsl.d      a1,     a2,     a1,     2
    VLD_DOUBLE_QPEL8_H_LSX          vr16, vr17, vr18, vr19, a1
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr23, vr24, vr25, vr26, vr12, vr13, \
                                    vr14, vr15, t5
    alsl.d      t5,     a2,     t5,     2
    VLD_DOUBLE_QPEL8_H_SSRANI_LSX   vr27, vr28, vr29, vr30, vr16, vr17, \
                                    vr18, vr19, t5
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    alsl.d      t8,     a2,     t8,     2
    // Continue the vertical filter from t6 (src + 6 * stride).
    vld         vr0,    t6,     0
    vldx        vr1,    t6,     a2
    vldx        vr2,    t6,     t1
    vldx        vr3,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr4,    t6,     0
    vldx        vr5,    t6,     a2
    vldx        vr6,    t6,     t1
    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
    vld         vr0,    t8,     0
    vldx        vr1,    t8,     a2
    vavgr.bu    vr13,   vr23,   vr13
    vavgr.bu    vr14,   vr24,   vr14
    vavgr.bu    vr13,   vr13,   vr0
    vavgr.bu    vr14,   vr14,   vr1
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr0,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr1,    t6,     0
    QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
    vldx        vr2,    t8,     t1
    vldx        vr3,    t8,     t2
    vavgr.bu    vr13,   vr25,   vr13
    vavgr.bu    vr14,   vr26,   vr14
    vavgr.bu    vr13,   vr13,   vr2
    vavgr.bu    vr14,   vr14,   vr3
    vstx        vr13,   a0,     t1
    vstx        vr14,   a0,     t2
    alsl.d      a0,     a2,     a0,     2   // dst += 4 * stride
    alsl.d      t8,     a2,     t8,     2
    vldx        vr2,    t6,     a2
    vldx        vr3,    t6,     t1
    QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
    vld         vr4,    t8,     0
    vldx        vr5,    t8,     a2
    vavgr.bu    vr13,   vr27,   vr13
    vavgr.bu    vr14,   vr28,   vr14
    vavgr.bu    vr13,   vr13,   vr4
    vavgr.bu    vr14,   vr14,   vr5
    vst         vr13,   a0,     0
    vstx        vr14,   a0,     a2
    vldx        vr4,    t6,     t2
    alsl.d      t6,     a2,     t6,     2
    vld         vr5,    t6,     0
    QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
    vldx        vr6,    t8,     t1
    vldx        vr0,    t8,     t2
    vavgr.bu    vr13,   vr29,   vr13
    vavgr.bu    vr14,   vr30,   vr14
    vavgr.bu    vr13,   vr13,   vr6
    vavgr.bu    vr14,   vr14,   vr0
    vstx        vr13,   a0,     t1
    vstx        vr14,   a0,     t2
    fld.d       f24,    sp,     0
    fld.d       f25,    sp,     8
    fld.d       f26,    sp,     16
    fld.d       f27,    sp,     24
    fld.d       f28,    sp,     32
    fld.d       f29,    sp,     40
    fld.d       f30,    sp,     48
    fld.d       f31,    sp,     56
    addi.d      sp,     sp,     64
.endm
|
|
|
/* Quarter-pel (3,3) "avg": H source one row down (t0 += stride),
 * V source one column right (t4 += 1); shared body does the rest. */
function ff_avg_h264_qpel16_mc33_lsx
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    addi.d      t0,     a1,     -2      // t0 = src - 2
    add.d       t0,     t0,     a2      // t0 = src + stride - 2
    add.d       t3,     a1,     zero    // t3 = src
    sub.d       t4,     a1,     t1      // t4 = src - 2 * stride
    addi.d      t4,     t4,     1       // ... + 1
    addi.d      t8,     a0,     0       // t8 = dst read pointer
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
|
|
|
/* Quarter-pel (1,1) "avg": H source at src, V source at src (no
 * extra offsets); shared body does the filtering and dst average. */
function ff_avg_h264_qpel16_mc11_lsx
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    addi.d      t0,     a1,     -2      // t0 = src - 2
    add.d       t3,     a1,     zero    // t3 = src
    sub.d       t4,     a1,     t1      // t4 = src - 2 * stride
    addi.d      t8,     a0,     0       // t8 = dst read pointer
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
|
|
|
/* Quarter-pel (3,1) "avg": V source one column right (t4 += 1). */
function ff_avg_h264_qpel16_mc31_lsx
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    addi.d      t0,     a1,     -2      // t0 = src - 2
    add.d       t3,     a1,     zero    // t3 = src
    sub.d       t4,     a1,     t1      // t4 = src - 2 * stride
    addi.d      t4,     t4,     1       // ... + 1
    addi.d      t8,     a0,     0       // t8 = dst read pointer
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
|
|
|
/* Quarter-pel (1,3) "avg": H source one row down (t0 += stride). */
function ff_avg_h264_qpel16_mc13_lsx
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    addi.d      t0,     a1,     -2      // t0 = src - 2
    add.d       t0,     t0,     a2      // t0 = src + stride - 2
    add.d       t3,     a1,     zero    // t3 = src
    sub.d       t4,     a1,     t1      // t4 = src - 2 * stride
    addi.d      t8,     a0,     0       // t8 = dst read pointer
    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
endfunc
|
|
|
/* void ff_avg_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
 *                                  ptrdiff_t stride)
 * Horizontal half-pel filter averaged with dst, 16x16, 4 rows/iter.
 * a0 = dst (write), t5 = dst (read), t0 = src - 2, a1 = t0 + 8. */
function ff_avg_h264_qpel16_mc20_lsx
    slli.d      t1,     a2,     1
    add.d       t2,     t1,     a2
    vldi        vr20,   0x414
    vldi        vr21,   0x405
    vldi        vr22,   0x410
    addi.d      t0,     a1,     -2      // t0 = src - 2
    addi.d      t5,     a0,     0       // t5 = dst read pointer
    addi.d      a1,     t0,     8       // right 8 columns
.rept 4
    VLD_DOUBLE_QPEL8_H_LSX      vr12, vr13, vr14, vr15, t0
    // Rows 0-1.
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr12, vr13, a1
    vld         vr0,    t5,     0
    vldx        vr1,    t5,     a2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vst         vr0,    a0,     0
    vstx        vr1,    a0,     a2
    add.d       a1,     a1,     t1      // a1 += 2 * stride
    // Rows 2-3 (a1 already advanced, so plain VLD variant works).
    VLD_QPEL8_H_SSRANI_LSX      vr2, vr3, vr14, vr15, a1
    vldx        vr0,    t5,     t1
    vldx        vr1,    t5,     t2
    vavgr.bu    vr0,    vr0,    vr2
    vavgr.bu    vr1,    vr1,    vr3
    vstx        vr0,    a0,     t1
    vstx        vr1,    a0,     t2
    alsl.d      t0,     a2,     t0,     2
    alsl.d      t5,     a2,     t5,     2
    alsl.d      a0,     a2,     a0,     2
    alsl.d      a1,     a2,     a1,     1   // a1 += 2 * stride (total 4)
.endr
endfunc
|
|
|
/* Horizontal stage of the HV (center) filter for two rows.
 * Same 6-tap combination as QPEL8_H_LSX but WITHOUT the +16 rounding:
 * the 16-bit intermediates feed the vertical stage (QPEL8_HV_V_LSX).
 * In:  vr0/vr1 = rows loaded at src - 2.
 * Out: \out0/\out1 = 20*(C+D) - 5*(B+E) + (A+F), 16-bit.
 * Uses vr20 = 20, vr21 = 5.  Clobbers vr2-vr11. */
.macro QPEL8_HV_H_LSX out0, out1
    vbsrl.v         vr2,    vr0,    1
    vbsrl.v         vr3,    vr1,    1
    vbsrl.v         vr4,    vr0,    2
    vbsrl.v         vr5,    vr1,    2
    vbsrl.v         vr6,    vr0,    3
    vbsrl.v         vr7,    vr1,    3
    vbsrl.v         vr8,    vr0,    4
    vbsrl.v         vr9,    vr1,    4
    vbsrl.v         vr10,   vr0,    5
    vbsrl.v         vr11,   vr1,    5
    vilvl.b         vr6,    vr4,    vr6
    vilvl.b         vr7,    vr5,    vr7
    vilvl.b         vr8,    vr2,    vr8
    vilvl.b         vr9,    vr3,    vr9
    vilvl.b         vr10,   vr0,    vr10
    vilvl.b         vr11,   vr1,    vr11
    vhaddw.hu.bu    vr6,    vr6,    vr6     // C + D
    vhaddw.hu.bu    vr7,    vr7,    vr7
    vhaddw.hu.bu    vr8,    vr8,    vr8     // B + E
    vhaddw.hu.bu    vr9,    vr9,    vr9
    vhaddw.hu.bu    vr10,   vr10,   vr10    // A + F
    vhaddw.hu.bu    vr11,   vr11,   vr11
    vmul.h          vr2,    vr6,    vr20
    vmul.h          vr3,    vr7,    vr20
    vmul.h          vr4,    vr8,    vr21
    vmul.h          vr5,    vr9,    vr21
    vssub.h         vr2,    vr2,    vr4
    vssub.h         vr3,    vr3,    vr5
    vsadd.h         \out0,  vr2,    vr10
    vsadd.h         \out1,  vr3,    vr11
.endm
|
|
|
/* Vertical stage of the HV filter: 6-tap over seven 16-bit rows
 * \in0..\in6 with 32-bit intermediates, rounding 512, shift 10.
 *   row A = 20*(\in2+\in3) - 5*(\in1+\in4) + (\in0+\in5) + 512
 *   row B = 20*(\in3+\in4) - 5*(\in2+\in5) + (\in1+\in6) + 512
 * Out: \out0/\out1 = 32-bit low halves, \out2/\out3 = high halves;
 *      finally \out3 holds both rows packed to saturated bytes
 *      (>> 10 via vssrani.hu.w, then vssrani.bu.h).
 * Uses vr22 = 20, vr23 = 5, vr24 = 512 (word splats).
 * Clobbers vr0-vr5. */
.macro QPEL8_HV_V_LSX in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3
    // Low four lanes.
    vilvl.h         vr0,    \in2,   \in3
    vilvl.h         vr1,    \in3,   \in4
    vilvl.h         vr2,    \in1,   \in4
    vilvl.h         vr3,    \in2,   \in5
    vilvl.h         vr4,    \in0,   \in5
    vilvl.h         vr5,    \in1,   \in6
    vhaddw.w.h      vr0,    vr0,    vr0
    vhaddw.w.h      vr1,    vr1,    vr1
    vhaddw.w.h      vr2,    vr2,    vr2
    vhaddw.w.h      vr3,    vr3,    vr3
    vhaddw.w.h      vr4,    vr4,    vr4
    vhaddw.w.h      vr5,    vr5,    vr5
    vmul.w          vr0,    vr0,    vr22
    vmul.w          vr1,    vr1,    vr22
    vmul.w          vr2,    vr2,    vr23
    vmul.w          vr3,    vr3,    vr23
    vssub.w         vr0,    vr0,    vr2
    vssub.w         vr1,    vr1,    vr3
    vsadd.w         vr0,    vr0,    vr4
    vsadd.w         vr1,    vr1,    vr5
    vsadd.w         \out0,  vr0,    vr24    // + 512 rounding
    vsadd.w         \out1,  vr1,    vr24
    // High four lanes.
    vilvh.h         vr0,    \in2,   \in3
    vilvh.h         vr1,    \in3,   \in4
    vilvh.h         vr2,    \in1,   \in4
    vilvh.h         vr3,    \in2,   \in5
    vilvh.h         vr4,    \in0,   \in5
    vilvh.h         vr5,    \in1,   \in6
    vhaddw.w.h      vr0,    vr0,    vr0
    vhaddw.w.h      vr1,    vr1,    vr1
    vhaddw.w.h      vr2,    vr2,    vr2
    vhaddw.w.h      vr3,    vr3,    vr3
    vhaddw.w.h      vr4,    vr4,    vr4
    vhaddw.w.h      vr5,    vr5,    vr5
    vmul.w          vr0,    vr0,    vr22
    vmul.w          vr1,    vr1,    vr22
    vmul.w          vr2,    vr2,    vr23
    vmul.w          vr3,    vr3,    vr23
    vssub.w         vr0,    vr0,    vr2
    vssub.w         vr1,    vr1,    vr3
    vsadd.w         vr0,    vr0,    vr4
    vsadd.w         vr1,    vr1,    vr5
    vsadd.w         \out2,  vr0,    vr24
    vsadd.w         \out3,  vr1,    vr24
    vssrani.hu.w    \out2,  \out0,  10      // (x + 512) >> 10, saturate
    vssrani.hu.w    \out3,  \out1,  10
    vssrani.bu.h    \out3,  \out2,  0       // pack both rows to bytes
.endm
|
|
|
/* 8x8 HV (center half-pel) lowpass core.
 * \in0 = source pointer (rows at src - 2, stride a3, t1 = 2*a3,
 * t2 = 3*a3), \in1 = dst (stride a2), \type = put or avg.
 * Horizontal stage (QPEL8_HV_H_LSX) keeps 16-bit rows in vr12-vr19 as
 * a sliding 7-row window; the vertical stage packs two output rows at
 * a time into vr1, stored 8 bytes per row via vstelm.d.
 * For \type == avg the result is first averaged with dst rows read
 * through t3 (t5/t6 offsets are set up by the avg caller — not
 * visible in this chunk; TODO confirm they are 2*/3* dst stride). */
.macro h264_qpel8_hv_lowpass_core_lsx in0, in1, type
    vld         vr0,    \in0,   0
    vldx        vr1,    \in0,   a3
    QPEL8_HV_H_LSX  vr12,   vr13            // rows a, b
    vldx        vr0,    \in0,   t1
    vldx        vr1,    \in0,   t2
    QPEL8_HV_H_LSX  vr14,   vr15            // rows c, d

    alsl.d      \in0,   a3,     \in0,   2

    vld         vr0,    \in0,   0
    vldx        vr1,    \in0,   a3
    QPEL8_HV_H_LSX  vr16,   vr17            // rows e, f
    vldx        vr0,    \in0,   t1
    vldx        vr1,    \in0,   t2
    QPEL8_HV_H_LSX  vr18,   vr19            // rows g, h
    QPEL8_HV_V_LSX  vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1
.ifc \type, avg
    fld.d       f2,     t3,     0
    fldx.d      f3,     t3,     a2
    vilvl.d     vr2,    vr3,    vr2
    vavgr.bu    vr1,    vr2,    vr1
.endif
    vstelm.d    vr1,    \in1,   0,      0   // output rows 0-1
    add.d       \in1,   \in1,   a2
    vstelm.d    vr1,    \in1,   0,      1

    alsl.d      \in0,   a3,     \in0,   2

    // Rows 2-3: extend the window by two more filtered rows.
    vld         vr0,    \in0,   0
    vldx        vr1,    \in0,   a3
    QPEL8_HV_H_LSX  vr12,   vr13
    QPEL8_HV_V_LSX  vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1
.ifc \type, avg
    fldx.d      f2,     t3,     t5
    fldx.d      f3,     t3,     t6
    vilvl.d     vr2,    vr3,    vr2
    vavgr.bu    vr1,    vr2,    vr1
.endif
    add.d       \in1,   \in1,   a2
    vstelm.d    vr1,    \in1,   0,      0
    add.d       \in1,   \in1,   a2
    vstelm.d    vr1,    \in1,   0,      1

    // Rows 4-5.
    vldx        vr0,    \in0,   t1
    vldx        vr1,    \in0,   t2
    QPEL8_HV_H_LSX  vr14,   vr15
    QPEL8_HV_V_LSX  vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1
.ifc \type, avg
    alsl.d      t3,     a2,     t3,     2
    fld.d       f2,     t3,     0
    fldx.d      f3,     t3,     a2
    vilvl.d     vr2,    vr3,    vr2
    vavgr.bu    vr1,    vr2,    vr1
.endif
    add.d       \in1,   \in1,   a2
    vstelm.d    vr1,    \in1,   0,      0
    add.d       \in1,   \in1,   a2
    vstelm.d    vr1,    \in1,   0,      1

    // Rows 6-7.
    alsl.d      \in0,   a3,     \in0,   2

    vld         vr0,    \in0,   0
    vldx        vr1,    \in0,   a3
    QPEL8_HV_H_LSX  vr16,   vr17
    QPEL8_HV_V_LSX  vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1
.ifc \type, avg
    fldx.d      f2,     t3,     t5
    fldx.d      f3,     t3,     t6
    vilvl.d     vr2,    vr3,    vr2
    vavgr.bu    vr1,    vr2,    vr1
.endif
    add.d       \in1,   \in1,   a2
    vstelm.d    vr1,    \in1,   0,      0
    add.d       \in1,   \in1,   a2
    vstelm.d    vr1,    \in1,   0,      1
.endm
|
|
|
// void put_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
//                                    ptrdiff_t dstStride, ptrdiff_t srcStride)
// a0 = dst, a1 = src, a2 = dstStride, a3 = srcStride.
// 8x8 centre (horizontal+vertical) half-pel interpolation, "put" flavour;
// the heavy lifting is in h264_qpel8_hv_lowpass_core_lsx.
function put_h264_qpel8_hv_lowpass_lsx
    slli.d          t1,     a3,     1       // t1 = 2 * srcStride
    add.d           t2,     t1,     a3      // t2 = 3 * srcStride
    addi.d          sp,     sp,     -8
    fst.d           f24,    sp,     0       // spill f24: low half of vr24 is callee-saved
    addi.d          t0,     a1,     -2      // t0 = src - 2 (horizontal window)
    sub.d           t0,     t0,     t1      // t0 = t0 - 2 * stride (vertical window)
    vldi            vr20,   0x414           // h_20: tap 20 per halfword
    vldi            vr21,   0x405           // h_5:  tap 5 per halfword
    vldi            vr22,   0x814           // w_20: tap 20 per word
    vldi            vr23,   0x805           // w_5:  tap 5 per word
    addi.d          t4,     zero,   512
    vreplgr2vr.w    vr24,   t4              // w_512: rounding for the 10-bit shift
    h264_qpel8_hv_lowpass_core_lsx t0, a0, put
    fld.d           f24,    sp,     0       // restore callee-saved f24
    addi.d          sp,     sp,     8
endfunc
|
|
|
// void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
//                                   ptrdiff_t dstStride, ptrdiff_t srcStride)
// Horizontal-only 6-tap half-pel filter over an 8x8 block.  Each loop
// iteration filters four source rows (QPEL8_H_LSX handles two rows at a
// time; the narrowed pair is packed into vr13 and stored lane by lane).
function put_h264_qpel8_h_lowpass_lsx
    slli.d          t1,     a3,     1       // t1 = 2 * srcStride
    add.d           t2,     t1,     a3      // t2 = 3 * srcStride
    vldi            vr20,   0x414           // tap 20 per halfword
    vldi            vr21,   0x405           // tap 5 per halfword
    vldi            vr22,   0x410           // rounding constant 16
    addi.d          t0,     a1,     -2      // t0 = src - 2 (filter window start)
    add.d           t3,     a1,     zero    // t3 = src -- NOTE(review): never read
                                            // below; apparently dead, kept as-is
.rept 2
    vld             vr0,    t0,     0
    vldx            vr1,    t0,     a3
    QPEL8_H_LSX     vr12,   vr13
    vssrani.bu.h    vr13,   vr12,   5       // >>5 narrow with u8 saturation; packs 2 rows
    vstelm.d        vr13,   a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr13,   a0,     0,      1
    add.d           a0,     a0,     a2
    vldx            vr0,    t0,     t1
    vldx            vr1,    t0,     t2
    QPEL8_H_LSX     vr12,   vr13
    vssrani.bu.h    vr13,   vr12,   5
    vstelm.d        vr13,   a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr13,   a0,     0,      1
    add.d           a0,     a0,     a2
    alsl.d          t0,     a3,     t0,     2   // src += 4 * srcStride
.endr
endfunc
|
|
|
// void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src,
//                            const uint8_t *half, ptrdiff_t dstStride,
//                            ptrdiff_t srcStride)
// dst = rounded_avg(src, half) over a 16x16 block.  'half' (a2) is read
// as contiguous 16-byte rows (fixed stride 16).  Four rows per iteration.
function put_pixels16_l2_8_lsx
    slli.d          t0,     a4,     1       // t0 = 2 * srcStride
    add.d           t1,     t0,     a4      // t1 = 3 * srcStride
    slli.d          t2,     t0,     1       // t2 = 4 * srcStride
    slli.d          t3,     a3,     1       // t3 = 2 * dstStride
    add.d           t4,     t3,     a3      // t4 = 3 * dstStride
    slli.d          t5,     t3,     1       // t5 = 4 * dstStride
.rept 4
    // Four strided src rows.
    vld             vr0,    a1,     0
    vldx            vr1,    a1,     a4
    vldx            vr2,    a1,     t0
    vldx            vr3,    a1,     t1
    add.d           a1,     a1,     t2
    // Four contiguous rows from the half-pel buffer.
    vld             vr8,    a2,     0x00
    vld             vr9,    a2,     0x10
    vld             vr10,   a2,     0x20
    vld             vr11,   a2,     0x30
    addi.d          a2,     a2,     0x40
    vavgr.bu        vr0,    vr8,    vr0     // rounded byte averages
    vavgr.bu        vr1,    vr9,    vr1
    vavgr.bu        vr2,    vr10,   vr2
    vavgr.bu        vr3,    vr11,   vr3
    vst             vr0,    a0,     0
    vstx            vr1,    a0,     a3
    vstx            vr2,    a0,     t3
    vstx            vr3,    a0,     t4
    add.d           a0,     a0,     t5
.endr
endfunc
|
|
|
// One vertical 6-tap filter step: \in0..\in6 hold seven consecutive 8-pixel
// rows (bytes in the low 64 bits).  Produces two output rows, rounded and
// narrowed ((x + 16) >> 5, u8 saturation), packed into vr8 (first row in
// lane 0, second in lane 1).  H.264 taps: 1, -5, 20, 20, -5, 1.
// Clobbers vr7-vr12.  Expects vr20 = 20, vr21 = 5, vr22 = 16 (halfwords).
.macro QPEL8_V1_LSX in0, in1, in2, in3, in4, in5, in6
    // Interleave tap pairs so a horizontal add gives each pair sum:
    vilvl.b         vr7,    \in3,   \in2    // centre pair (c+d), row r
    vilvl.b         vr8,    \in4,   \in3    // centre pair,       row r+1
    vilvl.b         vr9,    \in4,   \in1    // -5 pair (b+e),     row r
    vilvl.b         vr10,   \in5,   \in2    // -5 pair,           row r+1
    vilvl.b         vr11,   \in5,   \in0    // +1 pair (a+f),     row r
    vilvl.b         vr12,   \in6,   \in1    // +1 pair,           row r+1
    vhaddw.hu.bu    vr7,    vr7,    vr7     // widen to u16 pair sums
    vhaddw.hu.bu    vr8,    vr8,    vr8
    vhaddw.hu.bu    vr9,    vr9,    vr9
    vhaddw.hu.bu    vr10,   vr10,   vr10
    vhaddw.hu.bu    vr11,   vr11,   vr11
    vhaddw.hu.bu    vr12,   vr12,   vr12
    vmul.h          vr7,    vr7,    vr20    // 20 * (c+d)
    vmul.h          vr8,    vr8,    vr20
    vmul.h          vr9,    vr9,    vr21    // 5 * (b+e)
    vmul.h          vr10,   vr10,   vr21
    vssub.h         vr7,    vr7,    vr9     // 20*(c+d) - 5*(b+e), saturating
    vssub.h         vr8,    vr8,    vr10
    vsadd.h         vr7,    vr7,    vr11    // + (a+f)
    vsadd.h         vr8,    vr8,    vr12
    vsadd.h         vr7,    vr7,    vr22    // + 16 (round before >>5)
    vsadd.h         vr8,    vr8,    vr22
    vssrani.bu.h    vr8,    vr7,    5       // >>5, saturate to u8, pack both rows
.endm
|
|
|
// Template generating \type\()_h264_qpel8_v_lowpass_lsx:
//   void f(uint8_t *dst, const uint8_t *src,
//          ptrdiff_t dstStride, ptrdiff_t srcStride)
// Vertical-only 6-tap half-pel filter over an 8x8 block.  A sliding
// window of seven 8-byte rows rotates through vr0..vr6; QPEL8_V1_LSX
// emits two packed output rows in vr8 per call.  \type is "put" (plain
// store) or "avg" (rounded average with the bytes already at dst).
.macro h264_qpel8_v_lowpass_lsx type
function \type\()_h264_qpel8_v_lowpass_lsx
    slli.d          t0,     a3,     1       // t0 = 2 * srcStride
    add.d           t1,     t0,     a3      // t1 = 3 * srcStride
    sub.d           t2,     a1,     t0      // t2 = src - 2 * stride
.ifc \type, avg
    addi.d          t3,     a0,     0       // t3 = dst read pointer
    slli.d          t4,     a2,     1       // t4 = 2 * dstStride
    add.d           t5,     t4,     a2      // t5 = 3 * dstStride
.endif
    vldi            vr20,   0x414           // tap 20 per halfword
    vldi            vr21,   0x405           // tap 5 per halfword
    vldi            vr22,   0x410           // rounding constant 16

    // Prime the window with source rows -2..4 (f0..f6); output rows 0,1.
    fld.d           f0,     t2,     0
    fldx.d          f1,     t2,     a3
    fldx.d          f2,     t2,     t0
    fldx.d          f3,     t2,     t1
    alsl.d          t2,     a3,     t2,     2   // t2 = t2 + 4 * stride
    fld.d           f4,     t2,     0
    fldx.d          f5,     t2,     a3
    fldx.d          f6,     t2,     t0
    QPEL8_V1_LSX    vr0, vr1, vr2, vr3, vr4, vr5, vr6
.ifc \type, avg
    fld.d           f0,     t3,     0       // existing dst rows 0,1
    fldx.d          f1,     t3,     a2
    vilvl.d         vr0,    vr1,    vr0
    vavgr.bu        vr8,    vr8,    vr0
.endif
    vstelm.d        vr8,    a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr8,    a0,     0,      1
    add.d           a0,     a0,     a2

    // Slide in rows 5,6; output rows 2,3.
    fldx.d          f0,     t2,     t1
    alsl.d          t2,     a3,     t2,     2   // t2 = t2 + 4 * stride
    fld.d           f1,     t2,     0
    QPEL8_V1_LSX    vr2, vr3, vr4, vr5, vr6, vr0, vr1
.ifc \type, avg
    fldx.d          f2,     t3,     t4      // existing dst rows 2,3
    fldx.d          f3,     t3,     t5
    vilvl.d         vr2,    vr3,    vr2
    vavgr.bu        vr8,    vr8,    vr2
.endif
    vstelm.d        vr8,    a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr8,    a0,     0,      1
    add.d           a0,     a0,     a2

    // dst read ptr += 4 * dstStride.  Executed for "put" too, where t3
    // holds whatever the caller left there; harmless since only "avg"
    // reads t3.
    alsl.d          t3,     a2,     t3,     2

    // Slide in rows 7,8; output rows 4,5.
    fldx.d          f2,     t2,     a3
    fldx.d          f3,     t2,     t0
    QPEL8_V1_LSX    vr4, vr5, vr6, vr0, vr1, vr2, vr3
.ifc \type, avg
    fld.d           f4,     t3,     0       // existing dst rows 4,5
    fldx.d          f5,     t3,     a2
    vilvl.d         vr4,    vr5,    vr4
    vavgr.bu        vr8,    vr8,    vr4
.endif
    vstelm.d        vr8,    a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr8,    a0,     0,      1
    add.d           a0,     a0,     a2

    // Slide in rows 9,10; output rows 6,7.
    fldx.d          f4,     t2,     t1
    alsl.d          t2,     a3,     t2,     2   // t2 = t2 + 4 * stride
    fld.d           f5,     t2,     0
    QPEL8_V1_LSX    vr6, vr0, vr1, vr2, vr3, vr4, vr5
.ifc \type, avg
    fldx.d          f6,     t3,     t4      // existing dst rows 6,7
    fldx.d          f0,     t3,     t5
    vilvl.d         vr6,    vr0,    vr6
    vavgr.bu        vr8,    vr8,    vr6
.endif
    vstelm.d        vr8,    a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr8,    a0,     0,      1
endfunc
.endm
|
|
|
// Instantiate both flavours of the vertical lowpass template above.
h264_qpel8_v_lowpass_lsx put
h264_qpel8_v_lowpass_lsx avg
|
|
|
// void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src,
//                            const uint8_t *half, ptrdiff_t dstStride,
//                            ptrdiff_t srcStride)
// dst = rounded_avg(rounded_avg(src, half), dst) over a 16x16 block.
// Like put_pixels16_l2_8_lsx but additionally averaged with the bytes
// already in dst (t6 walks the dst read side).  'half' rows are 16 bytes
// apart.  Four rows per iteration.
function avg_pixels16_l2_8_lsx
    slli.d          t0,     a4,     1       // t0 = 2 * srcStride
    add.d           t1,     t0,     a4      // t1 = 3 * srcStride
    slli.d          t2,     t0,     1       // t2 = 4 * srcStride
    slli.d          t3,     a3,     1       // t3 = 2 * dstStride
    add.d           t4,     t3,    a3       // t4 = 3 * dstStride
    slli.d          t5,     t3,     1       // t5 = 4 * dstStride
    addi.d          t6,     a0,     0       // t6 = dst read pointer
.rept 4
    // Four strided src rows.
    vld             vr0,    a1,     0
    vldx            vr1,    a1,     a4
    vldx            vr2,    a1,     t0
    vldx            vr3,    a1,     t1
    add.d           a1,     a1,     t2
    // Four contiguous rows from the half-pel buffer.
    vld             vr8,    a2,     0x00
    vld             vr9,    a2,     0x10
    vld             vr10,   a2,     0x20
    vld             vr11,   a2,     0x30
    addi.d          a2,     a2,     0x40
    vavgr.bu        vr0,    vr8,    vr0     // first average: src vs half
    vavgr.bu        vr1,    vr9,    vr1
    vavgr.bu        vr2,    vr10,   vr2
    vavgr.bu        vr3,    vr11,   vr3
    // Existing dst rows.
    vld             vr8,    t6,     0
    vldx            vr9,    t6,     a3
    vldx            vr10,   t6,     t3
    vldx            vr11,   t6,     t4
    add.d           t6,     t6,     t5
    vavgr.bu        vr0,    vr8,    vr0     // second average: with dst
    vavgr.bu        vr1,    vr9,    vr1
    vavgr.bu        vr2,    vr10,   vr2
    vavgr.bu        vr3,    vr11,   vr3
    vst             vr0,    a0,     0
    vstx            vr1,    a0,     a3
    vstx            vr2,    a0,     t3
    vstx            vr3,    a0,     t4
    add.d           a0,     a0,     t5
.endr
endfunc
|
|
|
// void avg_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
//                                    ptrdiff_t dstStride, ptrdiff_t srcStride)
// 8x8 centre (h+v) half-pel interpolation, "avg" flavour: the filtered
// result is rounded-averaged with the bytes already at dst inside
// h264_qpel8_hv_lowpass_core_lsx (which reads dst through t3/t5/t6).
function avg_h264_qpel8_hv_lowpass_lsx
    slli.d          t1,     a3,     1       // t1 = 2 * srcStride
    add.d           t2,     t1,     a3      // t2 = 3 * srcStride
    slli.d          t5,     a2,     1       // t5 = 2 * dstStride
    add.d           t6,     a2,     t5      // t6 = 3 * dstStride
    addi.d          sp,     sp,     -8
    fst.d           f24,    sp,     0       // spill f24: low half of vr24 is callee-saved
    vldi            vr20,   0x414           // h_20: tap 20 per halfword
    vldi            vr21,   0x405           // h_5:  tap 5 per halfword
    vldi            vr22,   0x814           // w_20: tap 20 per word
    vldi            vr23,   0x805           // w_5:  tap 5 per word
    addi.d          t4,     zero,   512
    vreplgr2vr.w    vr24,   t4              // w_512: rounding for the 10-bit shift
    addi.d          t0,     a1,     -2      // t0 = src - 2
    sub.d           t0,     t0,     t1      // t0 = t0 - 2 * stride
    addi.d          t3,     a0,     0       // t3 = dst read pointer for the average
    h264_qpel8_hv_lowpass_core_lsx t0, a0, avg
    fld.d           f24,    sp,     0       // restore callee-saved f24
    addi.d          sp,     sp,     8
endfunc
|
|
|
// void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src,
//                           const uint8_t *half, ptrdiff_t dstStride,
//                           ptrdiff_t srcStride)
// dst = rounded_avg(src, half) over an 8x8 block.  'half' (a2) rows are
// 8 bytes apart; two 8-byte rows are packed per vector.  Four rows per
// iteration.
function put_pixels8_l2_8_lsx
    slli.d          t0,     a4,     1       // t0 = 2 * srcStride
    add.d           t1,     t0,     a4      // t1 = 3 * srcStride
    slli.d          t2,     t0,     1       // t2 = 4 * srcStride
.rept 2
    // Four strided src rows, packed two per vector.
    vld             vr0,    a1,     0
    vldx            vr1,    a1,     a4
    vldx            vr2,    a1,     t0
    vldx            vr3,    a1,     t1
    add.d           a1,     a1,     t2
    vilvl.d         vr0,    vr1,    vr0     // rows 0|1
    vilvl.d         vr2,    vr3,    vr2     // rows 2|3
    // Four contiguous 8-byte rows from the half-pel buffer.
    vld             vr8,    a2,     0x00
    vld             vr9,    a2,     0x08
    vld             vr10,   a2,     0x10
    vld             vr11,   a2,     0x18
    vilvl.d         vr8,    vr9,    vr8
    vilvl.d         vr10,   vr11,   vr10
    addi.d          a2,     a2,     32
    vavgr.bu        vr0,    vr8,    vr0     // rounded byte averages
    vavgr.bu        vr2,    vr10,   vr2
    vstelm.d        vr0,    a0,     0,      0
    add.d           a0,     a0,     a3
    vstelm.d        vr0,    a0,     0,      1
    add.d           a0,     a0,     a3
    vstelm.d        vr2,    a0,     0,      0
    add.d           a0,     a0,     a3
    vstelm.d        vr2,    a0,     0,      1
    add.d           a0,     a0,     a3
.endr
endfunc
|
|
|
// void ff_put_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
//                                 ptrdiff_t stride)
// Full-pel copy of an 8x8 block: eight 8-byte GPR loads/stores, four
// rows per loop iteration.
function ff_put_h264_qpel8_mc00_lsx
    slli.d          t0,     a2,     1       // t0 = 2 * stride
    add.d           t1,     t0,     a2      // t1 = 3 * stride
    slli.d          t2,     t0,     1       // t2 = 4 * stride
.rept 2
    ld.d            t3,     a1,     0x0     // load four src rows
    ldx.d           t4,     a1,     a2
    ldx.d           t5,     a1,     t0
    ldx.d           t6,     a1,     t1
    add.d           a1,     a1,     t2      // src += 4 * stride
    st.d            t3,     a0,     0x0     // store them to dst
    stx.d           t4,     a0,     a2
    stx.d           t5,     a0,     t0
    stx.d           t6,     a0,     t1
    add.d           a0,     a0,     t2      // dst += 4 * stride
.endr
endfunc
|
|
|
// void ff_avg_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
//                                 ptrdiff_t stride)
// Full-pel "avg" of an 8x8 block: dst = rounded_avg(dst, src).  Rows are
// packed two per vector; t3 walks the dst read side.  Four rows per
// iteration.
function ff_avg_h264_qpel8_mc00_lsx
    slli.d          t0,     a2,     1       // t0 = 2 * stride
    add.d           t1,     t0,     a2      // t1 = 3 * stride
    slli.d          t2,     t0,     1       // t2 = 4 * stride
    addi.d          t3,     a0,     0       // t3 = dst read pointer
.rept 2
    // Four src rows, packed two per vector.
    vld             vr0,    a1,     0
    vldx            vr1,    a1,     a2
    vldx            vr2,    a1,     t0
    vldx            vr3,    a1,     t1
    add.d           a1,     a1,     t2
    vilvl.d         vr0,    vr1,    vr0     // src rows 0|1
    vilvl.d         vr2,    vr3,    vr2     // src rows 2|3
    // Four existing dst rows.
    vld             vr8,    t3,     0
    vldx            vr9,    t3,     a2
    vldx            vr10,   t3,     t0
    vldx            vr11,   t3,     t1
    add.d           t3,     t3,     t2
    vilvl.d         vr8,    vr9,    vr8     // dst rows 0|1
    vilvl.d         vr10,   vr11,   vr10    // dst rows 2|3
    vavgr.bu        vr0,    vr8,    vr0     // rounded byte averages
    vavgr.bu        vr2,    vr10,   vr2
    vstelm.d        vr0,    a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr0,    a0,     0,      1
    add.d           a0,     a0,     a2
    vstelm.d        vr2,    a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr2,    a0,     0,      1
    add.d           a0,     a0,     a2
.endr
endfunc
|
|
|
// void avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src,
//                           const uint8_t *half, ptrdiff_t dstStride,
//                           ptrdiff_t srcStride)
// dst = rounded_avg(rounded_avg(src, half), dst) over an 8x8 block.
// 'half' rows are 8 bytes apart; rows are packed two per vector and t3
// walks the dst read side.  Four rows per iteration.
function avg_pixels8_l2_8_lsx
    slli.d          t0,     a4,     1       // t0 = 2 * srcStride
    add.d           t1,     t0,     a4      // t1 = 3 * srcStride
    slli.d          t2,     t0,     1       // t2 = 4 * srcStride
    addi.d          t3,     a0,     0       // t3 = dst read pointer
    slli.d          t4,     a3,     1       // t4 = 2 * dstStride
    add.d           t5,     t4,    a3       // t5 = 3 * dstStride
    slli.d          t6,     t4,     1       // t6 = 4 * dstStride
.rept 2
    // Four strided src rows, packed two per vector.
    vld             vr0,    a1,     0
    vldx            vr1,    a1,     a4
    vldx            vr2,    a1,     t0
    vldx            vr3,    a1,     t1
    add.d           a1,     a1,     t2
    vilvl.d         vr0,    vr1,    vr0     // src rows 0|1
    vilvl.d         vr2,    vr3,    vr2     // src rows 2|3
    // Four contiguous 8-byte rows from the half-pel buffer.
    vld             vr8,    a2,     0x00
    vld             vr9,    a2,     0x08
    vld             vr10,   a2,     0x10
    vld             vr11,   a2,     0x18
    addi.d          a2,     a2,     0x20
    vilvl.d         vr8,    vr9,    vr8
    vilvl.d         vr10,   vr11,   vr10
    vavgr.bu        vr0,    vr8,    vr0     // first average: src vs half
    vavgr.bu        vr2,    vr10,   vr2
    // Existing dst rows.
    vld             vr8,    t3,     0
    vldx            vr9,    t3,     a3
    vldx            vr10,   t3,     t4
    vldx            vr11,   t3,     t5
    add.d           t3,     t3,     t6
    vilvl.d         vr8,    vr9,    vr8
    vilvl.d         vr10,   vr11,   vr10
    vavgr.bu        vr0,    vr8,    vr0     // second average: with dst
    vavgr.bu        vr2,    vr10,   vr2
    vstelm.d        vr0,    a0,     0,      0
    add.d           a0,     a0,     a3
    vstelm.d        vr0,    a0,     0,      1
    add.d           a0,     a0,     a3
    vstelm.d        vr2,    a0,     0,      0
    add.d           a0,     a0,     a3
    vstelm.d        vr2,    a0,     0,      1
    add.d           a0,     a0,     a3
.endr
endfunc
|
|
|
// void avg_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
//                                   ptrdiff_t dstStride, ptrdiff_t srcStride)
// Horizontal 6-tap half-pel filter over an 8x8 block, rounded-averaged
// with the bytes already in dst.  Two rows per iteration.
// Fix: the dst read pointer (t4) was advanced by t1 = 2*srcStride per
// iteration although its row reads use dstStride offsets; it now advances
// by t5 = 2*dstStride (which was computed but never used before).  The
// current callers pass dstStride == srcStride, so behaviour is unchanged
// for them, but mixed strides no longer read the wrong dst rows.
function avg_h264_qpel8_h_lowpass_lsx
    slli.d          t1,     a3,     1       // t1 = 2 * srcStride
    add.d           t2,     t1,     a3      // t2 = 3 * srcStride
    slli.d          t5,     a2,     1       // t5 = 2 * dstStride
    add.d           t6,     t5,     a2      // t6 = 3 * dstStride (currently unused)
    vldi            vr20,   0x414           // tap 20 per halfword
    vldi            vr21,   0x405           // tap 5 per halfword
    vldi            vr22,   0x410           // rounding constant 16
    addi.d          t0,     a1,     -2      // t0 = src - 2 (filter window start)
    add.d           t3,     a1,     zero    // t3 = src -- NOTE(review): never read below
    addi.d          t4,     a0,     0       // t4 = dst read pointer for the average
.rept 4
    // Filter two source rows; results packed as bytes into vr13.
    vld             vr0,    t0,     0
    vldx            vr1,    t0,     a3
    QPEL8_H_LSX     vr12,   vr13
    vssrani.bu.h    vr13,   vr12,   5       // >>5 narrow with u8 saturation
    // Rounded average with the two existing dst rows.
    fld.d           f0,     t4,     0
    fldx.d          f1,     t4,     a2
    vilvl.d         vr0,    vr1,    vr0
    vavgr.bu        vr13,   vr13,   vr0
    vstelm.d        vr13,   a0,     0,      0
    add.d           a0,     a0,     a2
    vstelm.d        vr13,   a0,     0,      1
    add.d           a0,     a0,     a2
    add.d           t0,     t0,     t1      // src     += 2 * srcStride
    add.d           t4,     t4,     t5      // dst read += 2 * dstStride (was t1)
.endr
endfunc
|
|
|