mirror of https://github.com/FFmpeg/FFmpeg.git

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after:  214fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>

Branch: pull/389/head
parent 7845b5ecd6
commit 8815a7719e

10 changed files with 1342 additions and 1445 deletions
@@ -1,121 +0,0 @@
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264_intrapred_lasx.h"

#define PRED16X16_PLANE \
    ptrdiff_t stride_1, stride_2, stride_3, stride_4, stride_5, stride_6; \
    ptrdiff_t stride_8, stride_15; \
    int32_t res0, res1, res2, res3, cnt; \
    uint8_t *src0, *src1; \
    __m256i reg0, reg1, reg2, reg3, reg4; \
    __m256i tmp0, tmp1, tmp2, tmp3; \
    __m256i shuff = {0x0B040A0509060807, 0x0F000E010D020C03, 0, 0}; \
    __m256i mult = {0x0004000300020001, 0x0008000700060005, 0, 0}; \
    __m256i int_mult1 = {0x0000000100000000, 0x0000000300000002, \
                         0x0000000500000004, 0x0000000700000006}; \
    \
    stride_1 = -stride; \
    stride_2 = stride << 1; \
    stride_3 = stride_2 + stride; \
    stride_4 = stride_2 << 1; \
    stride_5 = stride_4 + stride; \
    stride_6 = stride_3 << 1; \
    stride_8 = stride_4 << 1; \
    stride_15 = (stride_8 << 1) - stride; \
    src0 = src - 1; \
    src1 = src0 + stride_8; \
    \
    reg0 = __lasx_xvldx(src0, -stride); \
    reg1 = __lasx_xvldx(src, (8 - stride)); \
    reg0 = __lasx_xvilvl_d(reg1, reg0); \
    reg0 = __lasx_xvshuf_b(reg0, reg0, shuff); \
    reg0 = __lasx_xvhsubw_hu_bu(reg0, reg0); \
    reg0 = __lasx_xvmul_h(reg0, mult); \
    res1 = (src1[0] - src0[stride_6]) + \
           2 * (src1[stride] - src0[stride_5]) + \
           3 * (src1[stride_2] - src0[stride_4]) + \
           4 * (src1[stride_3] - src0[stride_3]) + \
           5 * (src1[stride_4] - src0[stride_2]) + \
           6 * (src1[stride_5] - src0[stride]) + \
           7 * (src1[stride_6] - src0[0]) + \
           8 * (src0[stride_15] - src0[stride_1]); \
    reg0 = __lasx_xvhaddw_w_h(reg0, reg0); \
    reg0 = __lasx_xvhaddw_d_w(reg0, reg0); \
    reg0 = __lasx_xvhaddw_q_d(reg0, reg0); \
    res0 = __lasx_xvpickve2gr_w(reg0, 0);

#define PRED16X16_PLANE_END \
    res2 = (src0[stride_15] + src[15 - stride] + 1) << 4; \
    res3 = 7 * (res0 + res1); \
    res2 -= res3; \
    reg0 = __lasx_xvreplgr2vr_w(res0); \
    reg1 = __lasx_xvreplgr2vr_w(res1); \
    reg2 = __lasx_xvreplgr2vr_w(res2); \
    reg3 = __lasx_xvmul_w(reg0, int_mult1); \
    reg4 = __lasx_xvslli_w(reg0, 3); \
    reg4 = __lasx_xvadd_w(reg4, reg3); \
    for (cnt = 8; cnt--;) { \
        tmp0 = __lasx_xvadd_w(reg2, reg3); \
        tmp1 = __lasx_xvadd_w(reg2, reg4); \
        tmp0 = __lasx_xvssrani_hu_w(tmp1, tmp0, 5); \
        tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); \
        reg2 = __lasx_xvadd_w(reg2, reg1); \
        tmp2 = __lasx_xvadd_w(reg2, reg3); \
        tmp3 = __lasx_xvadd_w(reg2, reg4); \
        tmp1 = __lasx_xvssrani_hu_w(tmp3, tmp2, 5); \
        tmp1 = __lasx_xvpermi_d(tmp1, 0xD8); \
        tmp0 = __lasx_xvssrani_bu_h(tmp1, tmp0, 0); \
        reg2 = __lasx_xvadd_w(reg2, reg1); \
        __lasx_xvstelm_d(tmp0, src, 0, 0); \
        __lasx_xvstelm_d(tmp0, src, 8, 2); \
        src += stride; \
        __lasx_xvstelm_d(tmp0, src, 0, 1); \
        __lasx_xvstelm_d(tmp0, src, 8, 3); \
        src += stride; \
    }

void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
{
    PRED16X16_PLANE
    res0 = (5 * res0 + 32) >> 6;
    res1 = (5 * res1 + 32) >> 6;
    PRED16X16_PLANE_END
}

void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
{
    PRED16X16_PLANE
    res0 = (res0 + (res0 >> 2)) >> 4;
    res1 = (res1 + (res1 >> 2)) >> 4;
    PRED16X16_PLANE_END
}

void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
{
    PRED16X16_PLANE
    cnt  = (5 * (res0/4)) / 16;
    res0 = (5 * (res1/4)) / 16;
    res1 = cnt;
    PRED16X16_PLANE_END
}
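For reference, the arithmetic that the PRED16X16_PLANE / PRED16X16_PLANE_END macros above vectorize is the standard H.264 16x16 plane predictor: two border gradients (res0/res1) are computed from the top row and left column, and every output pixel is a clipped linear ramp. A minimal scalar sketch, assuming 8-bit pixels and illustrative variable names (this is not FFmpeg's exact C template):

#include <stdint.h>
#include <stddef.h>

/* Scalar sketch of 16x16 plane prediction, h264 scaling variant. */
static void pred16x16_plane_sketch(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top   = src + 7 - stride;      /* row above the block      */
    const uint8_t *left  = src + 8 * stride - 1;  /* column left of the block */
    const uint8_t *left2 = left - 2 * stride;
    int H = top[1] - top[-1];
    int V = left[0] - left2[0];
    int a, b, i, j, k;

    for (k = 2; k <= 8; k++) {
        left  += stride;
        left2 -= stride;
        H += k * (top[k]  - top[-k]);      /* res0 above */
        V += k * (left[0] - left2[0]);     /* res1 above */
    }
    /* h264 scaling; the rv40 and svq3 functions rescale differently */
    H = (5 * H + 32) >> 6;
    V = (5 * V + 32) >> 6;

    /* left[0] is now the bottom-left neighbour, top[8] the top-right one */
    a = 16 * (left[0] + top[8] + 1) - 7 * (H + V);
    for (j = 0; j < 16; j++) {
        b = a;
        a += V;
        for (i = 0; i < 16; i++) {
            int v = b >> 5;
            src[i] = v < 0 ? 0 : (v > 255 ? 255 : v);
            b += H;
        }
        src += stride;
    }
}

The three deleted functions differ only in how res0/res1 are rescaled, which is why that step sits between the two shared macros; the replacement assembly below keeps the same split.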
@@ -0,0 +1,966 @@
/*
 * Loongson LSX/LASX optimized h264chroma
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, |
||||
int h, int x, int y) */ |
||||
function ff_put_h264_chroma_mc8_lsx |
||||
li.d t8, 8 |
||||
sub.d t1, t8, a4 // 8-x |
||||
sub.d t2, t8, a5 // 8-y |
||||
mul.d t3, t1, t2 // A |
||||
mul.d t4, a4, t2 // B |
||||
mul.d t5, t1, a5 // C |
||||
mul.d t6, a4, a5 // D |
||||
add.d t0, t4, t5 // E |
||||
vreplgr2vr.b vr0, t3 |
||||
vreplgr2vr.b vr1, t4 |
||||
vreplgr2vr.b vr2, t5 |
||||
vreplgr2vr.b vr3, t6 |
||||
vreplgr2vr.b vr4, t0 |
||||
slli.d t2, a2, 1 |
||||
add.d t3, t2, a2 |
||||
slli.d t4, a2, 2 |
||||
|
||||
bge zero, t6, .ENDLOOP_D |
||||
move t1, a3 |
||||
vilvl.b vr9, vr1, vr0 |
||||
vilvl.b vr10, vr3, vr2 |
||||
.LOOP_D: |
||||
vld vr5, a1, 0 |
||||
vld vr6, a1, 1 |
||||
add.d a1, a1, a2 |
||||
vld vr7, a1, 0 |
||||
vld vr8, a1, 1 |
||||
vilvl.b vr11, vr6, vr5 |
||||
vilvl.b vr12, vr8, vr7 |
||||
vmulwev.h.bu vr13, vr9, vr11 |
||||
vmaddwod.h.bu vr13, vr9, vr11 |
||||
vmulwev.h.bu vr14, vr10, vr12 |
||||
vmaddwod.h.bu vr14, vr10, vr12 |
||||
vadd.h vr13, vr13, vr14 |
||||
vsrarni.b.h vr13, vr13, 6 |
||||
vstelm.d vr13, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vld vr6, a1, 1 |
||||
vilvl.b vr11, vr8, vr7 |
||||
vilvl.b vr12, vr6, vr5 |
||||
vmulwev.h.bu vr13, vr9, vr11 |
||||
vmaddwod.h.bu vr13, vr9, vr11 |
||||
vmulwev.h.bu vr14, vr10, vr12 |
||||
vmaddwod.h.bu vr14, vr10, vr12 |
||||
vadd.h vr13, vr13, vr14 |
||||
vsrarni.b.h vr13, vr13, 6 |
||||
vstelm.d vr13, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr7, a1, 0 |
||||
vld vr8, a1, 1 |
||||
vilvl.b vr11, vr6, vr5 |
||||
vilvl.b vr12, vr8, vr7 |
||||
vmulwev.h.bu vr13, vr9, vr11 |
||||
vmaddwod.h.bu vr13, vr9, vr11 |
||||
vmulwev.h.bu vr14, vr10, vr12 |
||||
vmaddwod.h.bu vr14, vr10, vr12 |
||||
vadd.h vr13, vr13, vr14 |
||||
vsrarni.b.h vr13, vr13, 6 |
||||
vstelm.d vr13, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vld vr6, a1, 1 |
||||
vilvl.b vr11, vr8, vr7 |
||||
vilvl.b vr12, vr6, vr5 |
||||
vmulwev.h.bu vr13, vr9, vr11 |
||||
vmaddwod.h.bu vr13, vr9, vr11 |
||||
vmulwev.h.bu vr14, vr10, vr12 |
||||
vmaddwod.h.bu vr14, vr10, vr12 |
||||
vadd.h vr13, vr13, vr14 |
||||
vsrarni.b.h vr13, vr13, 6 |
||||
vstelm.d vr13, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOP_D |
||||
b .ENDLOOP |
||||
.ENDLOOP_D: |
||||
|
||||
bge zero, t0, .ENDLOOP_E |
||||
move t1, a3 |
||||
li.d t7, 1 |
||||
slt t8, zero, t5 |
||||
maskeqz t5, a2, t8 |
||||
masknez t7, t7, t8 |
||||
or t7, t7, t5 |
||||
vilvl.b vr7, vr4, vr0 |
||||
.LOOP_E: |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOP_E |
||||
b .ENDLOOP |
||||
.ENDLOOP_E: |
||||
|
||||
move t1, a3 |
||||
.LOOP: |
||||
vld vr5, a1, 0 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vsrarni.b.h vr7, vr7, 6 |
||||
vilvl.b vr6, vr7, vr6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vldx vr5, a1, a2 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vsrarni.b.h vr7, vr7, 6 |
||||
vilvl.b vr6, vr7, vr6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vldx vr5, a1, t2 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vsrarni.b.h vr7, vr7, 6 |
||||
vilvl.b vr6, vr7, vr6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vldx vr5, a1, t3 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vsrarni.b.h vr7, vr7, 6 |
||||
vilvl.b vr6, vr7, vr6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, t4 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOP |
||||
.ENDLOOP: |
||||
endfunc |
||||
|
||||
/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, |
||||
int h, int x, int y) */ |
||||
function ff_avg_h264_chroma_mc8_lsx |
||||
li.d t8, 8 |
||||
sub.d t1, t8, a4 // 8-x |
||||
sub.d t2, t8, a5 // 8-y |
||||
mul.d t3, t1, t2 // A |
||||
mul.d t4, a4, t2 // B |
||||
mul.d t5, t1, a5 // C |
||||
mul.d t6, a4, a5 // D |
||||
add.d t0, t4, t5 // E |
||||
vreplgr2vr.b vr0, t3 |
||||
vreplgr2vr.b vr1, t4 |
||||
vreplgr2vr.b vr2, t5 |
||||
vreplgr2vr.b vr3, t6 |
||||
vreplgr2vr.b vr4, t0 |
||||
slli.d t2, a2, 1 |
||||
add.d t3, t2, a2 |
||||
slli.d t4, a2, 2 |
||||
|
||||
bge zero, t6, .ENDLOOPD |
||||
move t1, a3 |
||||
vilvl.b vr9, vr1, vr0 |
||||
vilvl.b vr10, vr3, vr2 |
||||
.LOOPD: |
||||
vld vr5, a1, 0 |
||||
vld vr6, a1, 1 |
||||
add.d a1, a1, a2 |
||||
vld vr7, a1, 0 |
||||
vld vr8, a1, 1 |
||||
vld vr11, a0, 0 |
||||
vilvl.b vr12, vr6, vr5 |
||||
vilvl.b vr13, vr8, vr7 |
||||
vmulwev.h.bu vr14, vr9, vr12 |
||||
vmaddwod.h.bu vr14, vr9, vr12 |
||||
vmulwev.h.bu vr15, vr10, vr13 |
||||
vmaddwod.h.bu vr15, vr10, vr13 |
||||
vadd.h vr14, vr14, vr15 |
||||
vsrari.h vr14, vr14, 6 |
||||
vsllwil.hu.bu vr11, vr11, 0 |
||||
vadd.h vr11, vr14, vr11 |
||||
vsrarni.b.h vr11, vr11, 1 |
||||
vstelm.d vr11, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vld vr6, a1, 1 |
||||
vld vr11, a0, 0 |
||||
vilvl.b vr12, vr8, vr7 |
||||
vilvl.b vr13, vr6, vr5 |
||||
vmulwev.h.bu vr14, vr9, vr12 |
||||
vmaddwod.h.bu vr14, vr9, vr12 |
||||
vmulwev.h.bu vr15, vr10, vr13 |
||||
vmaddwod.h.bu vr15, vr10, vr13 |
||||
vadd.h vr14, vr14, vr15 |
||||
vsrari.h vr14, vr14, 6 |
||||
vsllwil.hu.bu vr11, vr11, 0 |
||||
vadd.h vr11, vr14, vr11 |
||||
vsrarni.b.h vr11, vr11, 1 |
||||
vstelm.d vr11, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr7, a1, 0 |
||||
vld vr8, a1, 1 |
||||
vld vr11, a0, 0 |
||||
vilvl.b vr12, vr6, vr5 |
||||
vilvl.b vr13, vr8, vr7 |
||||
vmulwev.h.bu vr14, vr9, vr12 |
||||
vmaddwod.h.bu vr14, vr9, vr12 |
||||
vmulwev.h.bu vr15, vr10, vr13 |
||||
vmaddwod.h.bu vr15, vr10, vr13 |
||||
vadd.h vr14, vr14, vr15 |
||||
vsrari.h vr14, vr14, 6 |
||||
vsllwil.hu.bu vr11, vr11, 0 |
||||
vadd.h vr11, vr14, vr11 |
||||
vsrarni.b.h vr11, vr11, 1 |
||||
vstelm.d vr11, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vld vr6, a1, 1 |
||||
vld vr11, a0, 0 |
||||
vilvl.b vr12, vr8, vr7 |
||||
vilvl.b vr13, vr6, vr5 |
||||
vmulwev.h.bu vr14, vr9, vr12 |
||||
vmaddwod.h.bu vr14, vr9, vr12 |
||||
vmulwev.h.bu vr15, vr10, vr13 |
||||
vmaddwod.h.bu vr15, vr10, vr13 |
||||
vadd.h vr14, vr14, vr15 |
||||
vsrari.h vr14, vr14, 6 |
||||
vsllwil.hu.bu vr11, vr11, 0 |
||||
vadd.h vr11, vr14, vr11 |
||||
vsrarni.b.h vr11, vr11, 1 |
||||
vstelm.d vr11, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOPD |
||||
b .ENDLOOPELSE |
||||
.ENDLOOPD: |
||||
|
||||
bge zero, t0, .ENDLOOPE |
||||
move t1, a3 |
||||
li.d t7, 1 |
||||
slt t8, zero, t5 |
||||
maskeqz t5, a2, t8 |
||||
masknez t7, t7, t8 |
||||
or t7, t7, t5 |
||||
vilvl.b vr7, vr4, vr0 |
||||
.LOOPE: |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vld vr8, a0, 0 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vld vr8, a0, 0 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vld vr8, a0, 0 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vld vr8, a0, 0 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOPE |
||||
b .ENDLOOPELSE |
||||
.ENDLOOPE: |
||||
|
||||
move t1, a3 |
||||
.LOOPELSE: |
||||
vld vr5, a1, 0 |
||||
vld vr8, a0, 0 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vilvl.h vr6, vr7, vr6 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vldx vr5, a1, a2 |
||||
vld vr8, a0, 0 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vilvl.h vr6, vr7, vr6 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vldx vr5, a1, t2 |
||||
vld vr8, a0, 0 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vilvl.h vr6, vr7, vr6 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vldx vr5, a1, t3 |
||||
vld vr8, a0, 0 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vilvl.h vr6, vr7, vr6 |
||||
vsrari.h vr6, vr6, 6 |
||||
vsllwil.hu.bu vr8, vr8, 0 |
||||
vadd.h vr8, vr6, vr8 |
||||
vsrarni.b.h vr8, vr8, 1 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, t4 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOPELSE |
||||
.ENDLOOPELSE: |
||||
endfunc |
||||
|
||||
/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, |
||||
int h, int x, int y) */ |
||||
function ff_put_h264_chroma_mc4_lsx |
||||
li.d t8, 8 |
||||
sub.d t1, t8, a4 // 8-x |
||||
sub.d t2, t8, a5 // 8-y |
||||
mul.d t3, t1, t2 // A |
||||
mul.d t4, a4, t2 // B |
||||
mul.d t5, t1, a5 // C |
||||
mul.d t6, a4, a5 // D |
||||
add.d t0, t4, t5 // E |
||||
slli.d t8, a2, 1 |
||||
vreplgr2vr.b vr0, t3 |
||||
vreplgr2vr.b vr1, t4 |
||||
vreplgr2vr.b vr2, t5 |
||||
vreplgr2vr.b vr3, t6 |
||||
vreplgr2vr.b vr4, t0 |
||||
|
||||
bge zero, t6, .ENDPUT_D |
||||
move t1, a3 |
||||
vilvl.b vr9, vr1, vr0 |
||||
vilvl.b vr10, vr3, vr2 |
||||
.PUT_D: |
||||
vld vr5, a1, 0 |
||||
vld vr6, a1, 1 |
||||
add.d a1, a1, a2 |
||||
vld vr7, a1, 0 |
||||
vld vr8, a1, 1 |
||||
add.d a1, a1, a2 |
||||
vld vr11, a1, 0 |
||||
vld vr12, a1, 1 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vilvl.b vr7, vr8, vr7 |
||||
vilvl.b vr13, vr12, vr11 |
||||
vilvl.d vr5, vr7, vr5 |
||||
vilvl.d vr13, vr13, vr7 |
||||
vmulwev.h.bu vr14, vr9, vr5 |
||||
vmaddwod.h.bu vr14, vr9, vr5 |
||||
vmulwev.h.bu vr15, vr10, vr13 |
||||
vmaddwod.h.bu vr15, vr10, vr13 |
||||
vadd.h vr14, vr14, vr15 |
||||
vsrarni.b.h vr14, vr14, 6 |
||||
vstelm.w vr14, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vstelm.w vr14, a0, 0, 1 |
||||
add.d a0, a0, a2 |
||||
addi.d t1, t1, -2 |
||||
blt zero, t1, .PUT_D |
||||
b .ENDPUT |
||||
.ENDPUT_D: |
||||
|
||||
bge zero, t0, .ENDPUT_E |
||||
move t1, a3 |
||||
li.d t7, 1 |
||||
slt t8, zero, t5 |
||||
maskeqz t5, a2, t8 |
||||
masknez t7, t7, t8 |
||||
or t7, t7, t5 |
||||
vilvl.b vr7, vr4, vr0 |
||||
.PUT_E: |
||||
vld vr5, a1, 0 |
||||
vldx vr6, a1, t7 |
||||
vilvl.b vr5, vr6, vr5 |
||||
add.d a1, a1, a2 |
||||
vld vr8, a1, 0 |
||||
vldx vr9, a1, t7 |
||||
vilvl.b vr8, vr9, vr8 |
||||
vilvl.d vr5, vr8, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vstelm.w vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vstelm.w vr6, a0, 0, 1 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
addi.d t1, t1, -2 |
||||
blt zero, t1, .PUT_E |
||||
b .ENDPUT |
||||
.ENDPUT_E: |
||||
|
||||
move t1, a3 |
||||
.PUT: |
||||
vld vr5, a1, 0 |
||||
vldx vr8, a1, a2 |
||||
vilvl.w vr5, vr8, vr5 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vsrarni.b.h vr7, vr7, 6 |
||||
vilvl.b vr6, vr7, vr6 |
||||
vstelm.w vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vstelm.w vr6, a0, 0, 1 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, t8 |
||||
addi.d t1, t1, -2 |
||||
blt zero, t1, .PUT |
||||
.ENDPUT: |
||||
endfunc |
||||
|
||||
/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, |
||||
int h, int x, int y) */ |
||||
function ff_put_h264_chroma_mc8_lasx |
||||
li.d t8, 8 |
||||
sub.d t1, t8, a4 // 8-x |
||||
sub.d t2, t8, a5 // 8-y |
||||
mul.d t3, t1, t2 // A |
||||
mul.d t4, a4, t2 // B |
||||
mul.d t5, t1, a5 // C |
||||
mul.d t6, a4, a5 // D |
||||
add.d t0, t4, t5 // E |
||||
xvreplgr2vr.b xr0, t3 |
||||
xvreplgr2vr.b xr1, t4 |
||||
xvreplgr2vr.b xr2, t5 |
||||
xvreplgr2vr.b xr3, t6 |
||||
xvreplgr2vr.b xr4, t0 |
||||
slli.d t2, a2, 1 |
||||
add.d t3, t2, a2 |
||||
slli.d t4, a2, 2 |
||||
|
||||
bge zero, t6, .ENDLOOP_DA |
||||
move t1, a3 |
||||
xvilvl.b xr9, xr1, xr0 |
||||
xvilvl.b xr10, xr3, xr2 |
||||
.LOOP_DA: |
||||
fld.d f5, a1, 0 |
||||
fld.d f6, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f7, a1, 0 |
||||
fld.d f8, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f13, a1, 0 |
||||
fld.d f14, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f15, a1, 0 |
||||
fld.d f16, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f17, a1, 0 |
||||
fld.d f18, a1, 1 |
||||
vilvl.b vr11, vr6, vr5 |
||||
vilvl.b vr12, vr8, vr7 |
||||
vilvl.b vr14, vr14, vr13 |
||||
vilvl.b vr15, vr16, vr15 |
||||
vilvl.b vr16, vr18, vr17 |
||||
xvpermi.q xr11, xr12, 0x02 |
||||
xvpermi.q xr12, xr14, 0x02 |
||||
xvpermi.q xr14, xr15, 0x02 |
||||
xvpermi.q xr15, xr16, 0x02 |
||||
|
||||
xvmulwev.h.bu xr19, xr9, xr11 |
||||
xvmaddwod.h.bu xr19, xr9, xr11 |
||||
xvmulwev.h.bu xr20, xr10, xr12 |
||||
xvmaddwod.h.bu xr20, xr10, xr12 |
||||
xvadd.h xr21, xr19, xr20 |
||||
xvsrarni.b.h xr21, xr21, 6 |
||||
vstelm.d vr21, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr21, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
xvmulwev.h.bu xr13, xr9, xr14 |
||||
xvmaddwod.h.bu xr13, xr9, xr14 |
||||
xvmulwev.h.bu xr14, xr10, xr15 |
||||
xvmaddwod.h.bu xr14, xr10, xr15 |
||||
xvadd.h xr13, xr13, xr14 |
||||
xvsrarni.b.h xr13, xr13, 6 |
||||
vstelm.d vr13, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr13, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOP_DA |
||||
b .ENDLOOPA |
||||
.ENDLOOP_DA: |
||||
|
||||
bge zero, t0, .ENDLOOP_EA |
||||
move t1, a3 |
||||
li.d t7, 1 |
||||
slt t8, zero, t5 |
||||
maskeqz t5, a2, t8 |
||||
masknez t7, t7, t8 |
||||
or t7, t7, t5 |
||||
xvilvl.b xr7, xr4, xr0 |
||||
.LOOP_EA: |
||||
fld.d f5, a1, 0 |
||||
fldx.d f6, a1, t7 |
||||
add.d a1, a1, a2 |
||||
fld.d f9, a1, 0 |
||||
fldx.d f10, a1, t7 |
||||
add.d a1, a1, a2 |
||||
fld.d f11, a1, 0 |
||||
fldx.d f12, a1, t7 |
||||
add.d a1, a1, a2 |
||||
fld.d f13, a1, 0 |
||||
fldx.d f14, a1, t7 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vilvl.b vr9, vr10, vr9 |
||||
vilvl.b vr11, vr12, vr11 |
||||
vilvl.b vr13, vr14, vr13 |
||||
xvpermi.q xr5, xr9, 0x02 |
||||
xvpermi.q xr11, xr13, 0x02 |
||||
|
||||
xvmulwev.h.bu xr8, xr7, xr5 |
||||
xvmaddwod.h.bu xr8, xr7, xr5 |
||||
xvmulwev.h.bu xr6, xr7, xr11 |
||||
xvmaddwod.h.bu xr6, xr7, xr11 |
||||
xvsrarni.b.h xr8, xr8, 6 |
||||
vstelm.d vr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr8, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
xvsrarni.b.h xr6, xr6, 6 |
||||
vstelm.d vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr6, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOP_EA |
||||
b .ENDLOOPA |
||||
.ENDLOOP_EA: |
||||
|
||||
move t1, a3 |
||||
.LOOPA: |
||||
fld.d f5, a1, 0 |
||||
fldx.d f6, a1, a2 |
||||
fldx.d f7, a1, t2 |
||||
fldx.d f8, a1, t3 |
||||
vilvl.d vr5, vr6, vr5 |
||||
vilvl.d vr7, vr8, vr7 |
||||
xvpermi.q xr5, xr7, 0x02 |
||||
xvmulwev.h.bu xr6, xr0, xr5 |
||||
xvmulwod.h.bu xr7, xr0, xr5 |
||||
xvilvl.h xr8, xr7, xr6 |
||||
xvilvh.h xr9, xr7, xr6 |
||||
xvsrarni.b.h xr9, xr8, 6 |
||||
vstelm.d vr9, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vstelm.d vr9, a0, 0, 1 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr9, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr9, a0, 0, 3 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, t4 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOPA |
||||
.ENDLOOPA: |
||||
endfunc |
||||
|
||||
/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, |
||||
int h, int x, int y) */ |
||||
function ff_avg_h264_chroma_mc8_lasx |
||||
li.d t8, 8 |
||||
sub.d t1, t8, a4 // 8-x |
||||
sub.d t2, t8, a5 // 8-y |
||||
mul.d t3, t1, t2 // A |
||||
mul.d t4, a4, t2 // B |
||||
mul.d t5, t1, a5 // C |
||||
mul.d t6, a4, a5 // D |
||||
add.d t0, t4, t5 // E |
||||
xvreplgr2vr.b xr0, t3 |
||||
xvreplgr2vr.b xr1, t4 |
||||
xvreplgr2vr.b xr2, t5 |
||||
xvreplgr2vr.b xr3, t6 |
||||
xvreplgr2vr.b xr4, t0 |
||||
slli.d t2, a2, 1 |
||||
add.d t3, t2, a2 |
||||
slli.d t4, a2, 2 |
||||
|
||||
bge zero, t6, .ENDLOOPDA |
||||
move t1, a3 |
||||
xvilvl.b xr9, xr1, xr0 |
||||
xvilvl.b xr10, xr3, xr2 |
||||
.LOOPDA: |
||||
fld.d f5, a1, 0 |
||||
fld.d f6, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f7, a1, 0 |
||||
fld.d f8, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f11, a1, 0 |
||||
fld.d f12, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f13, a1, 0 |
||||
fld.d f14, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f15, a1, 0 |
||||
fld.d f16, a1, 1 |
||||
fld.d f17, a0, 0 |
||||
fldx.d f18, a0, a2 |
||||
fldx.d f19, a0, t2 |
||||
fldx.d f20, a0, t3 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vilvl.b vr7, vr8, vr7 |
||||
vilvl.b vr11, vr12, vr11 |
||||
vilvl.b vr13, vr14, vr13 |
||||
vilvl.b vr16, vr16, vr15 |
||||
xvpermi.q xr5, xr7, 0x02 |
||||
xvpermi.q xr7, xr11, 0x02 |
||||
xvpermi.q xr11, xr13, 0x02 |
||||
xvpermi.q xr13, xr16, 0x02 |
||||
xvpermi.q xr17, xr18, 0x02 |
||||
xvpermi.q xr19, xr20, 0x02 |
||||
|
||||
xvmulwev.h.bu xr14, xr9, xr5 |
||||
xvmaddwod.h.bu xr14, xr9, xr5 |
||||
xvmulwev.h.bu xr15, xr10, xr7 |
||||
xvmaddwod.h.bu xr15, xr10, xr7 |
||||
xvadd.h xr14, xr14, xr15 |
||||
xvsrari.h xr14, xr14, 6 |
||||
xvsllwil.hu.bu xr17, xr17, 0 |
||||
xvadd.h xr20, xr14, xr17 |
||||
xvsrarni.b.h xr20, xr20, 1 |
||||
xvstelm.d xr20, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr20, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
xvmulwev.h.bu xr14, xr9, xr11 |
||||
xvmaddwod.h.bu xr14, xr9, xr11 |
||||
xvmulwev.h.bu xr15, xr10, xr13 |
||||
xvmaddwod.h.bu xr15, xr10, xr13 |
||||
xvadd.h xr14, xr14, xr15 |
||||
xvsrari.h xr14, xr14, 6 |
||||
xvsllwil.hu.bu xr19, xr19, 0 |
||||
xvadd.h xr21, xr14, xr19 |
||||
xvsrarni.b.h xr21, xr21, 1 |
||||
xvstelm.d xr21, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr21, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOPDA |
||||
b .ENDLOOPELSEA |
||||
.ENDLOOPDA: |
||||
|
||||
bge zero, t0, .ENDLOOPEA |
||||
move t1, a3 |
||||
li.d t7, 1 |
||||
slt t8, zero, t5 |
||||
maskeqz t5, a2, t8 |
||||
masknez t7, t7, t8 |
||||
or t7, t7, t5 |
||||
xvilvl.b xr7, xr4, xr0 |
||||
.LOOPEA: |
||||
fld.d f5, a1, 0 |
||||
fldx.d f6, a1, t7 |
||||
add.d a1, a1, a2 |
||||
fld.d f8, a1, 0 |
||||
fldx.d f9, a1, t7 |
||||
add.d a1, a1, a2 |
||||
fld.d f10, a1, 0 |
||||
fldx.d f11, a1, t7 |
||||
add.d a1, a1, a2 |
||||
fld.d f12, a1, 0 |
||||
fldx.d f13, a1, t7 |
||||
add.d a1, a1, a2 |
||||
fld.d f14, a0, 0 |
||||
fldx.d f15, a0, a2 |
||||
fldx.d f16, a0, t2 |
||||
fldx.d f17, a0, t3 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vilvl.b vr8, vr9, vr8 |
||||
vilvl.b vr10, vr11, vr10 |
||||
vilvl.b vr12, vr13, vr12 |
||||
xvpermi.q xr5, xr8, 0x02 |
||||
xvpermi.q xr10, xr12, 0x02 |
||||
xvpermi.q xr14, xr15, 0x02 |
||||
xvpermi.q xr16, xr17, 0x02 |
||||
|
||||
xvmulwev.h.bu xr6, xr7, xr5 |
||||
xvmaddwod.h.bu xr6, xr7, xr5 |
||||
xvsrari.h xr6, xr6, 6 |
||||
xvsllwil.hu.bu xr14, xr14, 0 |
||||
xvadd.h xr8, xr6, xr14 |
||||
xvsrarni.b.h xr8, xr8, 1 |
||||
xvstelm.d xr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr8, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
xvmulwev.h.bu xr6, xr7, xr10 |
||||
xvmaddwod.h.bu xr6, xr7, xr10 |
||||
xvsrari.h xr6, xr6, 6 |
||||
xvsllwil.hu.bu xr16, xr16, 0 |
||||
xvadd.h xr8, xr6, xr16 |
||||
xvsrarni.b.h xr8, xr8, 1 |
||||
xvstelm.d xr8, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr8, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOPEA |
||||
b .ENDLOOPELSEA |
||||
.ENDLOOPEA: |
||||
|
||||
move t1, a3 |
||||
.LOOPELSEA: |
||||
fld.d f5, a1, 0 |
||||
fldx.d f6, a1, a2 |
||||
fldx.d f7, a1, t2 |
||||
fldx.d f8, a1, t3 |
||||
fld.d f9, a0, 0 |
||||
fldx.d f10, a0, a2 |
||||
fldx.d f11, a0, t2 |
||||
fldx.d f12, a0, t3 |
||||
xvpermi.q xr5, xr6, 0x02 |
||||
xvpermi.q xr7, xr8, 0x02 |
||||
xvpermi.q xr9, xr10, 0x02 |
||||
xvpermi.q xr11, xr12, 0x02 |
||||
|
||||
xvmulwev.h.bu xr12, xr0, xr5 |
||||
xvmulwod.h.bu xr13, xr0, xr5 |
||||
xvilvl.h xr12, xr13, xr12 |
||||
xvsrari.h xr12, xr12, 6 |
||||
xvsllwil.hu.bu xr9, xr9, 0 |
||||
xvadd.h xr9, xr12, xr9 |
||||
xvsrarni.b.h xr9, xr9, 1 |
||||
xvstelm.d xr9, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr9, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
xvmulwev.h.bu xr12, xr0, xr7 |
||||
xvmulwod.h.bu xr13, xr0, xr7 |
||||
xvilvl.h xr12, xr13, xr12 |
||||
xvsrari.h xr12, xr12, 6 |
||||
xvsllwil.hu.bu xr11, xr11, 0 |
||||
xvadd.h xr13, xr12, xr11 |
||||
xvsrarni.b.h xr13, xr13, 1 |
||||
xvstelm.d xr13, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
xvstelm.d xr13, a0, 0, 2 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, t4 |
||||
|
||||
addi.d t1, t1, -4 |
||||
blt zero, t1, .LOOPELSEA |
||||
.ENDLOOPELSEA: |
||||
endfunc |
||||
|
||||
/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, |
||||
int h, int x, int y) */ |
||||
function ff_put_h264_chroma_mc4_lasx |
||||
li.d t8, 8 |
||||
sub.d t1, t8, a4 // 8-x |
||||
sub.d t2, t8, a5 // 8-y |
||||
mul.d t3, t1, t2 // A |
||||
mul.d t4, a4, t2 // B |
||||
mul.d t5, t1, a5 // C |
||||
mul.d t6, a4, a5 // D |
||||
add.d t0, t4, t5 // E |
||||
slli.d t8, a2, 1 |
||||
vreplgr2vr.b vr0, t3 |
||||
vreplgr2vr.b vr1, t4 |
||||
vreplgr2vr.b vr2, t5 |
||||
vreplgr2vr.b vr3, t6 |
||||
vreplgr2vr.b vr4, t0 |
||||
|
||||
bge zero, t6, .ENDPUT_DA |
||||
move t1, a3 |
||||
vilvl.b vr9, vr1, vr0 |
||||
vilvl.b vr10, vr3, vr2 |
||||
.PUT_DA: |
||||
fld.d f5, a1, 0 |
||||
fld.d f6, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f7, a1, 0 |
||||
fld.d f8, a1, 1 |
||||
add.d a1, a1, a2 |
||||
fld.d f11, a1, 0 |
||||
fld.d f12, a1, 1 |
||||
vilvl.b vr5, vr6, vr5 |
||||
vilvl.b vr7, vr8, vr7 |
||||
vilvl.b vr13, vr12, vr11 |
||||
vilvl.d vr5, vr7, vr5 |
||||
vilvl.d vr13, vr13, vr7 |
||||
vmulwev.h.bu vr14, vr9, vr5 |
||||
vmaddwod.h.bu vr14, vr9, vr5 |
||||
vmulwev.h.bu vr15, vr10, vr13 |
||||
vmaddwod.h.bu vr15, vr10, vr13 |
||||
xvadd.h xr14, xr14, xr15 |
||||
vsrarni.b.h vr16, vr14, 6 |
||||
vstelm.w vr16, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vstelm.w vr16, a0, 0, 1 |
||||
add.d a0, a0, a2 |
||||
addi.d t1, t1, -2 |
||||
blt zero, t1, .PUT_DA |
||||
b .ENDPUTA |
||||
.ENDPUT_DA: |
||||
|
||||
bge zero, t0, .ENDPUT_EA |
||||
move t1, a3 |
||||
li.d t7, 1 |
||||
slt t8, zero, t5 |
||||
maskeqz t5, a2, t8 |
||||
masknez t7, t7, t8 |
||||
or t7, t7, t5 |
||||
vilvl.b vr7, vr4, vr0 |
||||
.PUT_EA: |
||||
fld.d f5, a1, 0 |
||||
fldx.d f6, a1, t7 |
||||
vilvl.b vr5, vr6, vr5 |
||||
add.d a1, a1, a2 |
||||
fld.d f8, a1, 0 |
||||
fldx.d f9, a1, t7 |
||||
vilvl.b vr8, vr9, vr8 |
||||
vilvl.d vr5, vr8, vr5 |
||||
vmulwev.h.bu vr6, vr7, vr5 |
||||
vmaddwod.h.bu vr6, vr7, vr5 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vstelm.w vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vstelm.w vr6, a0, 0, 1 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, a2 |
||||
addi.d t1, t1, -2 |
||||
blt zero, t1, .PUT_EA |
||||
b .ENDPUTA |
||||
.ENDPUT_EA: |
||||
|
||||
move t1, a3 |
||||
.PUTA: |
||||
fld.d f5, a1, 0 |
||||
fldx.d f8, a1, a2 |
||||
vilvl.w vr5, vr8, vr5 |
||||
vmulwev.h.bu vr6, vr0, vr5 |
||||
vmulwod.h.bu vr7, vr0, vr5 |
||||
vilvl.h vr6, vr7, vr6 |
||||
vsrarni.b.h vr6, vr6, 6 |
||||
vstelm.w vr6, a0, 0, 0 |
||||
add.d a0, a0, a2 |
||||
vstelm.w vr6, a0, 0, 1 |
||||
add.d a0, a0, a2 |
||||
add.d a1, a1, t8 |
||||
addi.d t1, t1, -2 |
||||
blt zero, t1, .PUTA |
||||
.ENDPUTA: |
||||
endfunc |
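The A/B/C/D factors computed at the top of each function in this file are the standard H.264 bilinear chroma weights: A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, with A + B + C + D = 64, which is why every path rounds with a shift of 6 (and by one more bit in the avg variants after adding the existing destination pixels). A scalar sketch of one 8-pixel row, with illustrative names (not the exact C fallback):

#include <stdint.h>
#include <stddef.h>

/* One row of put_h264_chroma_mc8: 2x2 bilinear blend, round, shift by 6. */
static void chroma_mc8_row_sketch(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;

    for (int i = 0; i < 8; i++)
        dst[i] = (A * src[i]          + B * src[i + 1] +
                  C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6;
}

The branch structure above (.ENDLOOP_D, .ENDLOOP_E and friends) exists because the full 2-D blend is only needed when D is non-zero; when only E = B + C or only A remains, the kernel degenerates to a 1-D filter or a weighted copy, which the assembly handles with cheaper loops.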
File diff suppressed because it is too large
@@ -1,36 +0,0 @@
/*
 * Copyright (c) 2020 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_LOONGARCH_H264CHROMA_LASX_H
#define AVCODEC_LOONGARCH_H264CHROMA_LASX_H

#include <stdint.h>
#include <stddef.h>
#include "libavcodec/h264.h"

void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

#endif /* AVCODEC_LOONGARCH_H264CHROMA_LASX_H */
@@ -0,0 +1,41 @@
/*
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H

#include "libavcodec/h264.h"

void ff_put_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
                                long int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
                                long int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_lsx(unsigned char *dst, const unsigned char *src,
                                long int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_lasx(unsigned char *dst, const unsigned char *src,
                                 long int stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
                                 long int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
                                 long int stride, int h, int x, int y);

#endif /* AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H */
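These prototypes are what the chroma DSP init code hooks into H264ChromaContext at runtime. A sketch of the usual wiring, assuming FFmpeg's common init pattern (the cpu-flag macros and table field names below follow that pattern and are not copied from this diff):

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264chroma.h"
#include "h264chroma_loongarch.h"

/* Sketch: select LSX/LASX chroma MC routines at runtime (8-bit only). */
av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_lsx(cpu_flags) && bit_depth <= 8) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lsx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lsx;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lsx;
    }
    if (have_lasx(cpu_flags) && bit_depth <= 8) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lasx;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lasx;
    }
}

In this sketch the LASX assignments come second and override the LSX ones on cores that support both, which is consistent with benchmarking under --disable-lasx to exercise the LSX paths.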
@@ -0,0 +1,299 @@
/*
 * Loongson LSX optimized h264intrapred
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

const shufa
.byte 6, 5, 4, 3, 2, 1, 0
endconst

const mulk
.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
endconst

const mulh
.byte 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
.byte 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0
endconst

.macro PRED16X16_PLANE
    slli.d        t6, a1, 1
    slli.d        t4, a1, 3
    addi.d        t0, a0, 7
    sub.d         t0, t0, a1
    add.d         t1, a0, t4
    addi.d        t1, t1, -1
    sub.d         t2, t1, t6

    ld.bu         t3, t0, 1
    ld.bu         t4, t0, -1
    ld.bu         t5, t1, 0
    ld.bu         t7, t2, 0
    sub.d         t3, t3, t4
    sub.d         t4, t5, t7

    la.local      t5, mulk
    vld           vr0, t5, 0
    fld.d         f1, t0, 2
    fld.d         f2, t0, -8
    la.local      t5, shufa
    fld.d         f3, t5, 0
    vshuf.b       vr2, vr2, vr2, vr3
    vilvl.b       vr1, vr1, vr2
    vhsubw.hu.bu  vr1, vr1, vr1
    vmul.h        vr0, vr0, vr1
    vhaddw.w.h    vr1, vr0, vr0
    vhaddw.d.w    vr0, vr1, vr1
    vhaddw.q.d    vr1, vr0, vr0
    vpickve2gr.w  t5, vr1, 0
    add.d         t3, t3, t5
//2
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t5, t7, t8
    slli.d        t5, t5, 1

//3&4
    add.d         t1, t1, t6
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ld.bu         t7, t1, 0
    sub.d         t7, t7, t8
    slli.d        t8, t7, 1
    add.d         t7, t7, t8
    add.d         t5, t5, t7
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t7, t7, t8
    slli.d        t7, t7, 2
    add.d         t5, t5, t7

//5&6
    add.d         t1, t1, t6
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ld.bu         t7, t1, 0
    sub.d         t7, t7, t8
    slli.d        t8, t7, 2
    add.d         t7, t7, t8
    add.d         t5, t5, t7
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t7, t7, t8
    slli.d        t8, t7, 1
    slli.d        t7, t7, 2
    add.d         t7, t7, t8
    add.d         t5, t5, t7

//7&8
    add.d         t1, t1, t6
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ld.bu         t7, t1, 0
    sub.d         t7, t7, t8
    slli.d        t8, t7, 3
    sub.d         t7, t8, t7
    add.d         t5, t5, t7
    sub.d         t2, t2, a1
    ld.bu         t8, t2, 0
    ldx.bu        t7, t1, a1
    sub.d         t7, t7, t8
    slli.d        t7, t7, 3
    add.d         t5, t5, t7
    add.d         t4, t4, t5
    add.d         t1, t1, a1
.endm

.macro PRED16X16_PLANE_END
    ld.bu         t7, t1, 0
    ld.bu         t8, t2, 16
    add.d         t5, t7, t8
    addi.d        t5, t5, 1
    slli.d        t5, t5, 4
    add.d         t7, t3, t4
    slli.d        t8, t7, 3
    sub.d         t7, t8, t7
    sub.d         t5, t5, t7

    la.local      t8, mulh
    vld           vr3, t8, 0
    slli.d        t8, t3, 3
    vreplgr2vr.h  vr4, t3
    vreplgr2vr.h  vr9, t8
    vmul.h        vr5, vr3, vr4

.rept 16
    move          t7, t5
    add.d         t5, t5, t4
    vreplgr2vr.h  vr6, t7
    vadd.h        vr7, vr6, vr5
    vadd.h        vr8, vr9, vr7
    vssrani.bu.h  vr8, vr7, 5
    vst           vr8, a0, 0
    add.d         a0, a0, a1
.endr
.endm

.macro PRED16X16_PLANE_END_LASX
    ld.bu         t7, t1, 0
    ld.bu         t8, t2, 16
    add.d         t5, t7, t8
    addi.d        t5, t5, 1
    slli.d        t5, t5, 4
    add.d         t7, t3, t4
    slli.d        t8, t7, 3
    sub.d         t7, t8, t7
    sub.d         t5, t5, t7

    la.local      t8, mulh
    xvld          xr3, t8, 0
    xvreplgr2vr.h xr4, t3
    xvmul.h       xr5, xr3, xr4

.rept 8
    move          t7, t5
    add.d         t5, t5, t4
    xvreplgr2vr.h xr6, t7
    xvreplgr2vr.h xr8, t5
    add.d         t5, t5, t4
    xvadd.h       xr7, xr6, xr5
    xvadd.h       xr9, xr8, xr5

    xvssrani.bu.h xr9, xr7, 5
    vstelm.d      vr9, a0, 0, 0
    xvstelm.d     xr9, a0, 8, 2
    add.d         a0, a0, a1
    vstelm.d      vr9, a0, 0, 1
    xvstelm.d     xr9, a0, 8, 3
    add.d         a0, a0, a1
.endr
.endm

/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
 */
function ff_h264_pred16x16_plane_h264_8_lsx
    PRED16X16_PLANE

    slli.d        t7, t3, 2
    add.d         t3, t3, t7
    addi.d        t3, t3, 32
    srai.d        t3, t3, 6
    slli.d        t7, t4, 2
    add.d         t4, t4, t7
    addi.d        t4, t4, 32
    srai.d        t4, t4, 6

    PRED16X16_PLANE_END
endfunc

/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
 */
function ff_h264_pred16x16_plane_rv40_8_lsx
    PRED16X16_PLANE

    srai.d        t7, t3, 2
    add.d         t3, t3, t7
    srai.d        t3, t3, 4
    srai.d        t7, t4, 2
    add.d         t4, t4, t7
    srai.d        t4, t4, 4

    PRED16X16_PLANE_END
endfunc

/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
 */
function ff_h264_pred16x16_plane_svq3_8_lsx
    PRED16X16_PLANE

    li.d          t6, 4
    li.d          t7, 5
    li.d          t8, 16
    div.d         t3, t3, t6
    mul.d         t3, t3, t7
    div.d         t3, t3, t8
    div.d         t4, t4, t6
    mul.d         t4, t4, t7
    div.d         t4, t4, t8
    move          t7, t3
    move          t3, t4
    move          t4, t7

    PRED16X16_PLANE_END
endfunc

/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
 */
function ff_h264_pred16x16_plane_h264_8_lasx
    PRED16X16_PLANE

    slli.d        t7, t3, 2
    add.d         t3, t3, t7
    addi.d        t3, t3, 32
    srai.d        t3, t3, 6
    slli.d        t7, t4, 2
    add.d         t4, t4, t7
    addi.d        t4, t4, 32
    srai.d        t4, t4, 6

    PRED16X16_PLANE_END_LASX
endfunc

/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
 */
function ff_h264_pred16x16_plane_rv40_8_lasx
    PRED16X16_PLANE

    srai.d        t7, t3, 2
    add.d         t3, t3, t7
    srai.d        t3, t3, 4
    srai.d        t7, t4, 2
    add.d         t4, t4, t7
    srai.d        t4, t4, 4

    PRED16X16_PLANE_END_LASX
endfunc

/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
 */
function ff_h264_pred16x16_plane_svq3_8_lasx
    PRED16X16_PLANE

    li.d          t5, 4
    li.d          t7, 5
    li.d          t8, 16
    div.d         t3, t3, t5
    mul.d         t3, t3, t7
    div.d         t3, t3, t8
    div.d         t4, t4, t5
    mul.d         t4, t4, t7
    div.d         t4, t4, t8
    move          t7, t3
    move          t3, t4
    move          t4, t7

    PRED16X16_PLANE_END_LASX
endfunc