/* Extracted from the web view of a mirror of https://github.com/FFmpeg/FFmpeg.git */
/*
 * Loongson LSX/LASX optimized h264chroma
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
#include "loongson_asm.S" |
|
|
|
/*
 * void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 *                                 int h, int x, int y)
 *
 * 8-pixel-wide bilinear chroma interpolation using 128-bit LSX vectors:
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and E = B + C.
 *
 * Registers: a0 = dst, a1 = src, a2 = stride, a3 = h, a4 = x, a5 = y.
 * Clobbers:  t0-t8, vr0-vr14.
 *
 * Three paths are selected from the coefficients:
 *   D > 0        two-dimensional filter (.LOOP_D)
 *   else E > 0   one-dimensional filter A*src[i] + E*src[i+step], where
 *                step = stride when C > 0, otherwise 1 (.LOOP_E)
 *   else         A-only scaling (.LOOP)
 *
 * Every loop writes four rows per iteration and tests the row counter after
 * the body, so h is expected to be a positive multiple of 4.
 * NOTE(review): confirm against the callers.
 * vld always fetches 16 bytes even though only the low 8 (or 9) are consumed.
 */
function ff_put_h264_chroma_mc8_lsx
    li.d            t8, 8
    sub.d           t1, t8, a4              // 8-x
    sub.d           t2, t8, a5              // 8-y
    mul.d           t3, t1, t2              // A
    mul.d           t4, a4, t2              // B
    mul.d           t5, t1, a5              // C
    mul.d           t6, a4, a5              // D
    add.d           t0, t4, t5              // E
    vreplgr2vr.b    vr0, t3                 // vr0 = A in every byte
    vreplgr2vr.b    vr1, t4                 // vr1 = B
    vreplgr2vr.b    vr2, t5                 // vr2 = C
    vreplgr2vr.b    vr3, t6                 // vr3 = D
    vreplgr2vr.b    vr4, t0                 // vr4 = E
    slli.d          t2, a2, 1               // t2 = 2 * stride
    add.d           t3, t2, a2              // t3 = 3 * stride
    slli.d          t4, a2, 2               // t4 = 4 * stride

    // ---- D > 0: two-dimensional filter ----
    bge             zero, t6, .ENDLOOP_D
    move            t1, a3                  // t1 = rows remaining
    vilvl.b         vr9, vr1, vr0           // vr9  = A,B,A,B,... (current row weights)
    vilvl.b         vr10, vr3, vr2          // vr10 = C,D,C,D,... (next row weights)
.LOOP_D:
    // row 0: the upper source row lives in vr5/vr6, the lower one in vr7/vr8
    vld             vr5, a1, 0
    vld             vr6, a1, 1
    add.d           a1, a1, a2
    vld             vr7, a1, 0
    vld             vr8, a1, 1
    vilvl.b         vr11, vr6, vr5          // src[i], src[i+1] pairs of upper row
    vilvl.b         vr12, vr8, vr7          // same for lower row
    vmulwev.h.bu    vr13, vr9, vr11         // A * src[i]
    vmaddwod.h.bu   vr13, vr9, vr11         // + B * src[i+1]
    vmulwev.h.bu    vr14, vr10, vr12        // C * src[i+stride]
    vmaddwod.h.bu   vr14, vr10, vr12        // + D * src[i+stride+1]
    vadd.h          vr13, vr13, vr14
    vsrarni.b.h     vr13, vr13, 6           // rounding >> 6, narrow to bytes
    vstelm.d        vr13, a0, 0, 0          // store 8 pixels
    add.d           a0, a0, a2
    // row 1: the previous lower row (vr7/vr8) becomes the upper row
    add.d           a1, a1, a2
    vld             vr5, a1, 0
    vld             vr6, a1, 1
    vilvl.b         vr11, vr8, vr7
    vilvl.b         vr12, vr6, vr5
    vmulwev.h.bu    vr13, vr9, vr11
    vmaddwod.h.bu   vr13, vr9, vr11
    vmulwev.h.bu    vr14, vr10, vr12
    vmaddwod.h.bu   vr14, vr10, vr12
    vadd.h          vr13, vr13, vr14
    vsrarni.b.h     vr13, vr13, 6
    vstelm.d        vr13, a0, 0, 0
    add.d           a0, a0, a2
    // row 2
    add.d           a1, a1, a2
    vld             vr7, a1, 0
    vld             vr8, a1, 1
    vilvl.b         vr11, vr6, vr5
    vilvl.b         vr12, vr8, vr7
    vmulwev.h.bu    vr13, vr9, vr11
    vmaddwod.h.bu   vr13, vr9, vr11
    vmulwev.h.bu    vr14, vr10, vr12
    vmaddwod.h.bu   vr14, vr10, vr12
    vadd.h          vr13, vr13, vr14
    vsrarni.b.h     vr13, vr13, 6
    vstelm.d        vr13, a0, 0, 0
    add.d           a0, a0, a2
    // row 3; a1 is left on source row 4, the first row of the next iteration
    add.d           a1, a1, a2
    vld             vr5, a1, 0
    vld             vr6, a1, 1
    vilvl.b         vr11, vr8, vr7
    vilvl.b         vr12, vr6, vr5
    vmulwev.h.bu    vr13, vr9, vr11
    vmaddwod.h.bu   vr13, vr9, vr11
    vmulwev.h.bu    vr14, vr10, vr12
    vmaddwod.h.bu   vr14, vr10, vr12
    vadd.h          vr13, vr13, vr14
    vsrarni.b.h     vr13, vr13, 6
    vstelm.d        vr13, a0, 0, 0
    add.d           a0, a0, a2

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOP_D
    b               .ENDLOOP
.ENDLOOP_D:

    // ---- D == 0, E > 0: one-dimensional filter ----
    bge             zero, t0, .ENDLOOP_E
    move            t1, a3
    li.d            t7, 1
    slt             t8, zero, t5            // t8 = (C > 0)
    maskeqz         t5, a2, t8              // t5 = C > 0 ? stride : 0
    masknez         t7, t7, t8              // t7 = C > 0 ? 0 : 1
    or              t7, t7, t5              // t7 = step to the second tap
    vilvl.b         vr7, vr4, vr0           // vr7 = A,E,A,E,...
.LOOP_E:
    // row 0
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vilvl.b         vr5, vr6, vr5           // src[i], src[i+step] pairs
    vmulwev.h.bu    vr6, vr7, vr5           // A * src[i]
    vmaddwod.h.bu   vr6, vr7, vr5           // + E * src[i+step]
    vsrarni.b.h     vr6, vr6, 6
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    // row 1
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vilvl.b         vr5, vr6, vr5
    vmulwev.h.bu    vr6, vr7, vr5
    vmaddwod.h.bu   vr6, vr7, vr5
    vsrarni.b.h     vr6, vr6, 6
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    // row 2
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vilvl.b         vr5, vr6, vr5
    vmulwev.h.bu    vr6, vr7, vr5
    vmaddwod.h.bu   vr6, vr7, vr5
    vsrarni.b.h     vr6, vr6, 6
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    // row 3
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vilvl.b         vr5, vr6, vr5
    vmulwev.h.bu    vr6, vr7, vr5
    vmaddwod.h.bu   vr6, vr7, vr5
    vsrarni.b.h     vr6, vr6, 6
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOP_E
    b               .ENDLOOP
.ENDLOOP_E:

    // ---- D == 0, E == 0: only the A tap contributes ----
    move            t1, a3
.LOOP:
    // row 0
    vld             vr5, a1, 0
    vmulwev.h.bu    vr6, vr0, vr5           // A * even pixels
    vmulwod.h.bu    vr7, vr0, vr5           // A * odd pixels
    vsrarni.b.h     vr6, vr6, 6
    vsrarni.b.h     vr7, vr7, 6
    vilvl.b         vr6, vr7, vr6           // re-interleave even/odd results
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    // row 1
    vldx            vr5, a1, a2
    vmulwev.h.bu    vr6, vr0, vr5
    vmulwod.h.bu    vr7, vr0, vr5
    vsrarni.b.h     vr6, vr6, 6
    vsrarni.b.h     vr7, vr7, 6
    vilvl.b         vr6, vr7, vr6
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    // row 2
    vldx            vr5, a1, t2
    vmulwev.h.bu    vr6, vr0, vr5
    vmulwod.h.bu    vr7, vr0, vr5
    vsrarni.b.h     vr6, vr6, 6
    vsrarni.b.h     vr7, vr7, 6
    vilvl.b         vr6, vr7, vr6
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    // row 3
    vldx            vr5, a1, t3
    vmulwev.h.bu    vr6, vr0, vr5
    vmulwod.h.bu    vr7, vr0, vr5
    vsrarni.b.h     vr6, vr6, 6
    vsrarni.b.h     vr7, vr7, 6
    vilvl.b         vr6, vr7, vr6
    vstelm.d        vr6, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, t4              // src += 4 * stride

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOP
.ENDLOOP:
endfunc
|
|
|
/*
 * void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 *                                 int h, int x, int y)
 *
 * 8-pixel-wide bilinear chroma interpolation averaged with the existing
 * destination, using 128-bit LSX vectors:
 *   pred   = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 *   dst[i] = (dst[i] + pred + 1) >> 1
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and E = B + C.
 *
 * Registers: a0 = dst, a1 = src, a2 = stride, a3 = h, a4 = x, a5 = y.
 * Clobbers:  t0-t8, vr0-vr15.
 *
 * Path selection:
 *   D > 0        two-dimensional filter (.LOOPD)
 *   else E > 0   one-dimensional filter with step = C > 0 ? stride : 1 (.LOOPE)
 *   else         A-only scaling (.LOOPELSE)
 *
 * Every loop handles four rows per iteration and tests the counter after the
 * body, so h is expected to be a positive multiple of 4.
 * NOTE(review): confirm against the callers.
 */
function ff_avg_h264_chroma_mc8_lsx
    li.d            t8, 8
    sub.d           t1, t8, a4              // 8-x
    sub.d           t2, t8, a5              // 8-y
    mul.d           t3, t1, t2              // A
    mul.d           t4, a4, t2              // B
    mul.d           t5, t1, a5              // C
    mul.d           t6, a4, a5              // D
    add.d           t0, t4, t5              // E
    vreplgr2vr.b    vr0, t3                 // vr0 = A in every byte
    vreplgr2vr.b    vr1, t4                 // vr1 = B
    vreplgr2vr.b    vr2, t5                 // vr2 = C
    vreplgr2vr.b    vr3, t6                 // vr3 = D
    vreplgr2vr.b    vr4, t0                 // vr4 = E
    slli.d          t2, a2, 1               // t2 = 2 * stride
    add.d           t3, t2, a2              // t3 = 3 * stride
    slli.d          t4, a2, 2               // t4 = 4 * stride

    // ---- D > 0: two-dimensional filter ----
    bge             zero, t6, .ENDLOOPD
    move            t1, a3                  // t1 = rows remaining
    vilvl.b         vr9, vr1, vr0           // vr9  = A,B,A,B,...
    vilvl.b         vr10, vr3, vr2          // vr10 = C,D,C,D,...
.LOOPD:
    // row 0: upper source row in vr5/vr6, lower in vr7/vr8, old dst in vr11
    vld             vr5, a1, 0
    vld             vr6, a1, 1
    add.d           a1, a1, a2
    vld             vr7, a1, 0
    vld             vr8, a1, 1
    vld             vr11, a0, 0
    vilvl.b         vr12, vr6, vr5          // src[i], src[i+1] pairs of upper row
    vilvl.b         vr13, vr8, vr7          // same for lower row
    vmulwev.h.bu    vr14, vr9, vr12         // A * src[i]
    vmaddwod.h.bu   vr14, vr9, vr12         // + B * src[i+1]
    vmulwev.h.bu    vr15, vr10, vr13        // C * src[i+stride]
    vmaddwod.h.bu   vr15, vr10, vr13        // + D * src[i+stride+1]
    vadd.h          vr14, vr14, vr15
    vsrari.h        vr14, vr14, 6           // pred = rounding >> 6 (still 16-bit)
    vsllwil.hu.bu   vr11, vr11, 0           // widen the low 8 dst bytes to halfwords
    vadd.h          vr11, vr14, vr11
    vsrarni.b.h     vr11, vr11, 1           // (pred + dst + 1) >> 1, narrow to bytes
    vstelm.d        vr11, a0, 0, 0
    add.d           a0, a0, a2
    // row 1: the previous lower row (vr7/vr8) becomes the upper row
    add.d           a1, a1, a2
    vld             vr5, a1, 0
    vld             vr6, a1, 1
    vld             vr11, a0, 0
    vilvl.b         vr12, vr8, vr7
    vilvl.b         vr13, vr6, vr5
    vmulwev.h.bu    vr14, vr9, vr12
    vmaddwod.h.bu   vr14, vr9, vr12
    vmulwev.h.bu    vr15, vr10, vr13
    vmaddwod.h.bu   vr15, vr10, vr13
    vadd.h          vr14, vr14, vr15
    vsrari.h        vr14, vr14, 6
    vsllwil.hu.bu   vr11, vr11, 0
    vadd.h          vr11, vr14, vr11
    vsrarni.b.h     vr11, vr11, 1
    vstelm.d        vr11, a0, 0, 0
    add.d           a0, a0, a2
    // row 2
    add.d           a1, a1, a2
    vld             vr7, a1, 0
    vld             vr8, a1, 1
    vld             vr11, a0, 0
    vilvl.b         vr12, vr6, vr5
    vilvl.b         vr13, vr8, vr7
    vmulwev.h.bu    vr14, vr9, vr12
    vmaddwod.h.bu   vr14, vr9, vr12
    vmulwev.h.bu    vr15, vr10, vr13
    vmaddwod.h.bu   vr15, vr10, vr13
    vadd.h          vr14, vr14, vr15
    vsrari.h        vr14, vr14, 6
    vsllwil.hu.bu   vr11, vr11, 0
    vadd.h          vr11, vr14, vr11
    vsrarni.b.h     vr11, vr11, 1
    vstelm.d        vr11, a0, 0, 0
    add.d           a0, a0, a2
    // row 3; a1 is left on source row 4, the first row of the next iteration
    add.d           a1, a1, a2
    vld             vr5, a1, 0
    vld             vr6, a1, 1
    vld             vr11, a0, 0
    vilvl.b         vr12, vr8, vr7
    vilvl.b         vr13, vr6, vr5
    vmulwev.h.bu    vr14, vr9, vr12
    vmaddwod.h.bu   vr14, vr9, vr12
    vmulwev.h.bu    vr15, vr10, vr13
    vmaddwod.h.bu   vr15, vr10, vr13
    vadd.h          vr14, vr14, vr15
    vsrari.h        vr14, vr14, 6
    vsllwil.hu.bu   vr11, vr11, 0
    vadd.h          vr11, vr14, vr11
    vsrarni.b.h     vr11, vr11, 1
    vstelm.d        vr11, a0, 0, 0
    add.d           a0, a0, a2

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOPD
    b               .ENDLOOPELSE
.ENDLOOPD:

    // ---- D == 0, E > 0: one-dimensional filter ----
    bge             zero, t0, .ENDLOOPE
    move            t1, a3
    li.d            t7, 1
    slt             t8, zero, t5            // t8 = (C > 0)
    maskeqz         t5, a2, t8              // t5 = C > 0 ? stride : 0
    masknez         t7, t7, t8              // t7 = C > 0 ? 0 : 1
    or              t7, t7, t5              // t7 = step to the second tap
    vilvl.b         vr7, vr4, vr0           // vr7 = A,E,A,E,...
.LOOPE:
    // row 0
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vld             vr8, a0, 0              // old dst
    vilvl.b         vr5, vr6, vr5           // src[i], src[i+step] pairs
    vmulwev.h.bu    vr6, vr7, vr5           // A * src[i]
    vmaddwod.h.bu   vr6, vr7, vr5           // + E * src[i+step]
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    // row 1
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vld             vr8, a0, 0
    vilvl.b         vr5, vr6, vr5
    vmulwev.h.bu    vr6, vr7, vr5
    vmaddwod.h.bu   vr6, vr7, vr5
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    // row 2
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vld             vr8, a0, 0
    vilvl.b         vr5, vr6, vr5
    vmulwev.h.bu    vr6, vr7, vr5
    vmaddwod.h.bu   vr6, vr7, vr5
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    // row 3
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vld             vr8, a0, 0
    vilvl.b         vr5, vr6, vr5
    vmulwev.h.bu    vr6, vr7, vr5
    vmaddwod.h.bu   vr6, vr7, vr5
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, a2

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOPE
    b               .ENDLOOPELSE
.ENDLOOPE:

    // ---- D == 0, E == 0: only the A tap contributes ----
    move            t1, a3
.LOOPELSE:
    // row 0
    vld             vr5, a1, 0
    vld             vr8, a0, 0              // old dst
    vmulwev.h.bu    vr6, vr0, vr5           // A * even pixels
    vmulwod.h.bu    vr7, vr0, vr5           // A * odd pixels
    vilvl.h         vr6, vr7, vr6           // products of pixels 0..7 in order
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    // row 1
    vldx            vr5, a1, a2
    vld             vr8, a0, 0
    vmulwev.h.bu    vr6, vr0, vr5
    vmulwod.h.bu    vr7, vr0, vr5
    vilvl.h         vr6, vr7, vr6
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    // row 2
    vldx            vr5, a1, t2
    vld             vr8, a0, 0
    vmulwev.h.bu    vr6, vr0, vr5
    vmulwod.h.bu    vr7, vr0, vr5
    vilvl.h         vr6, vr7, vr6
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    // row 3
    vldx            vr5, a1, t3
    vld             vr8, a0, 0
    vmulwev.h.bu    vr6, vr0, vr5
    vmulwod.h.bu    vr7, vr0, vr5
    vilvl.h         vr6, vr7, vr6
    vsrari.h        vr6, vr6, 6
    vsllwil.hu.bu   vr8, vr8, 0
    vadd.h          vr8, vr6, vr8
    vsrarni.b.h     vr8, vr8, 1
    vstelm.d        vr8, a0, 0, 0
    add.d           a0, a0, a2
    add.d           a1, a1, t4              // src += 4 * stride

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOPELSE
.ENDLOOPELSE:
endfunc
|
|
|
/*
 * void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 *                                 int h, int x, int y)
 *
 * 4-pixel-wide bilinear chroma interpolation using 128-bit LSX vectors:
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and E = B + C.
 *
 * Registers: a0 = dst, a1 = src, a2 = stride, a3 = h, a4 = x, a5 = y.
 * Clobbers:  t0-t8, vr0-vr15.
 *
 * Path selection:
 *   D > 0        two-dimensional filter (.PUT_D)
 *   else E > 0   one-dimensional filter with step = C > 0 ? stride : 1 (.PUT_E)
 *   else         A-only scaling (.PUT)
 *
 * Two output rows are packed into one vector per iteration and the counter is
 * tested after the body, so h is expected to be a positive multiple of 2.
 * NOTE(review): confirm against the callers.
 * vld always fetches 16 bytes even though only the low 4 (or 5) are consumed.
 */
function ff_put_h264_chroma_mc4_lsx
    li.d            t8, 8
    sub.d           t1, t8, a4              // 8-x
    sub.d           t2, t8, a5              // 8-y
    mul.d           t3, t1, t2              // A
    mul.d           t4, a4, t2              // B
    mul.d           t5, t1, a5              // C
    mul.d           t6, a4, a5              // D
    add.d           t0, t4, t5              // E
    slli.d          t8, a2, 1               // t8 = 2 * stride (used by .PUT only)
    vreplgr2vr.b    vr0, t3                 // vr0 = A in every byte
    vreplgr2vr.b    vr1, t4                 // vr1 = B
    vreplgr2vr.b    vr2, t5                 // vr2 = C
    vreplgr2vr.b    vr3, t6                 // vr3 = D
    vreplgr2vr.b    vr4, t0                 // vr4 = E

    // ---- D > 0: two-dimensional filter ----
    bge             zero, t6, .ENDPUT_D
    move            t1, a3                  // t1 = rows remaining
    vilvl.b         vr9, vr1, vr0           // vr9  = A,B,A,B,...
    vilvl.b         vr10, vr3, vr2          // vr10 = C,D,C,D,...
.PUT_D:
    // load source rows 0, 1 and 2; a1 is left on row 2 for the next iteration
    vld             vr5, a1, 0
    vld             vr6, a1, 1
    add.d           a1, a1, a2
    vld             vr7, a1, 0
    vld             vr8, a1, 1
    add.d           a1, a1, a2
    vld             vr11, a1, 0
    vld             vr12, a1, 1
    vilvl.b         vr5, vr6, vr5           // row 0: src[i], src[i+1] pairs
    vilvl.b         vr7, vr8, vr7           // row 1
    vilvl.b         vr13, vr12, vr11        // row 2
    vilvl.d         vr5, vr7, vr5           // upper rows:  row 0 | row 1
    vilvl.d         vr13, vr13, vr7         // lower rows:  row 1 | row 2
    vmulwev.h.bu    vr14, vr9, vr5          // A * src[i]
    vmaddwod.h.bu   vr14, vr9, vr5          // + B * src[i+1]
    vmulwev.h.bu    vr15, vr10, vr13        // C * src[i+stride]
    vmaddwod.h.bu   vr15, vr10, vr13        // + D * src[i+stride+1]
    vadd.h          vr14, vr14, vr15
    vsrarni.b.h     vr14, vr14, 6           // rounding >> 6, narrow to bytes
    vstelm.w        vr14, a0, 0, 0          // output row 0 (4 pixels)
    add.d           a0, a0, a2
    vstelm.w        vr14, a0, 0, 1          // output row 1
    add.d           a0, a0, a2
    addi.d          t1, t1, -2
    blt             zero, t1, .PUT_D
    b               .ENDPUT
.ENDPUT_D:

    // ---- D == 0, E > 0: one-dimensional filter ----
    bge             zero, t0, .ENDPUT_E
    move            t1, a3
    li.d            t7, 1
    slt             t8, zero, t5            // t8 = (C > 0); 2*stride is not needed here
    maskeqz         t5, a2, t8              // t5 = C > 0 ? stride : 0
    masknez         t7, t7, t8              // t7 = C > 0 ? 0 : 1
    or              t7, t7, t5              // t7 = step to the second tap
    vilvl.b         vr7, vr4, vr0           // vr7 = A,E,A,E,...
.PUT_E:
    vld             vr5, a1, 0
    vldx            vr6, a1, t7
    vilvl.b         vr5, vr6, vr5           // row 0: src[i], src[i+step] pairs
    add.d           a1, a1, a2
    vld             vr8, a1, 0
    vldx            vr9, a1, t7
    vilvl.b         vr8, vr9, vr8           // row 1
    vilvl.d         vr5, vr8, vr5           // row 0 | row 1
    vmulwev.h.bu    vr6, vr7, vr5           // A * src[i]
    vmaddwod.h.bu   vr6, vr7, vr5           // + E * src[i+step]
    vsrarni.b.h     vr6, vr6, 6
    vstelm.w        vr6, a0, 0, 0
    add.d           a0, a0, a2
    vstelm.w        vr6, a0, 0, 1
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    addi.d          t1, t1, -2
    blt             zero, t1, .PUT_E
    b               .ENDPUT
.ENDPUT_E:

    // ---- D == 0, E == 0: only the A tap contributes ----
    move            t1, a3
.PUT:
    vld             vr5, a1, 0
    vldx            vr8, a1, a2
    vilvl.w         vr5, vr8, vr5           // low 8 bytes = row 0 (4 px) | row 1 (4 px)
    vmulwev.h.bu    vr6, vr0, vr5           // A * even pixels
    vmulwod.h.bu    vr7, vr0, vr5           // A * odd pixels
    vsrarni.b.h     vr6, vr6, 6
    vsrarni.b.h     vr7, vr7, 6
    vilvl.b         vr6, vr7, vr6           // re-interleave even/odd results
    vstelm.w        vr6, a0, 0, 0
    add.d           a0, a0, a2
    vstelm.w        vr6, a0, 0, 1
    add.d           a0, a0, a2
    add.d           a1, a1, t8              // src += 2 * stride
    addi.d          t1, t1, -2
    blt             zero, t1, .PUT
.ENDPUT:
endfunc
|
|
|
/*
 * void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 *                                  int h, int x, int y)
 *
 * 8-pixel-wide bilinear chroma interpolation using 256-bit LASX vectors:
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and E = B + C.
 *
 * Registers: a0 = dst, a1 = src, a2 = stride, a3 = h, a4 = x, a5 = y.
 * Clobbers:  t0-t8, xr0-xr21 (rows are loaded through the f registers that
 *            alias the low 64 bits of the same-numbered vector registers).
 *
 * Path selection:
 *   D > 0        two-dimensional filter (.LOOP_DA)
 *   else E > 0   one-dimensional filter with step = C > 0 ? stride : 1 (.LOOP_EA)
 *   else         A-only scaling (.LOOPA)
 *
 * Two rows are paired per 256-bit register (one per 128-bit lane) and four
 * rows are written per iteration; the counter is tested after the body, so h
 * is expected to be a positive multiple of 4.
 * NOTE(review): confirm against the callers.
 */
function ff_put_h264_chroma_mc8_lasx
    li.d            t8, 8
    sub.d           t1, t8, a4              // 8-x
    sub.d           t2, t8, a5              // 8-y
    mul.d           t3, t1, t2              // A
    mul.d           t4, a4, t2              // B
    mul.d           t5, t1, a5              // C
    mul.d           t6, a4, a5              // D
    add.d           t0, t4, t5              // E
    xvreplgr2vr.b   xr0, t3                 // xr0 = A in every byte
    xvreplgr2vr.b   xr1, t4                 // xr1 = B
    xvreplgr2vr.b   xr2, t5                 // xr2 = C
    xvreplgr2vr.b   xr3, t6                 // xr3 = D
    xvreplgr2vr.b   xr4, t0                 // xr4 = E
    slli.d          t2, a2, 1               // t2 = 2 * stride
    add.d           t3, t2, a2              // t3 = 3 * stride
    slli.d          t4, a2, 2               // t4 = 4 * stride

    // ---- D > 0: two-dimensional filter ----
    bge             zero, t6, .ENDLOOP_DA
    move            t1, a3                  // t1 = rows remaining
    xvilvl.b        xr9, xr1, xr0           // xr9  = A,B,A,B,...
    xvilvl.b        xr10, xr3, xr2          // xr10 = C,D,C,D,...
.LOOP_DA:
    // load source rows 0..4 at offsets 0 and 1; a1 is left on row 4
    fld.d           f5, a1, 0
    fld.d           f6, a1, 1
    add.d           a1, a1, a2
    fld.d           f7, a1, 0
    fld.d           f8, a1, 1
    add.d           a1, a1, a2
    fld.d           f13, a1, 0
    fld.d           f14, a1, 1
    add.d           a1, a1, a2
    fld.d           f15, a1, 0
    fld.d           f16, a1, 1
    add.d           a1, a1, a2
    fld.d           f17, a1, 0
    fld.d           f18, a1, 1
    vilvl.b         vr11, vr6, vr5          // row 0: src[i], src[i+1] pairs
    vilvl.b         vr12, vr8, vr7          // row 1
    vilvl.b         vr14, vr14, vr13        // row 2
    vilvl.b         vr15, vr16, vr15        // row 3
    vilvl.b         vr16, vr18, vr17        // row 4
    xvpermi.q       xr11, xr12, 0x02        // xr11 = row 0 | row 1
    xvpermi.q       xr12, xr14, 0x02        // xr12 = row 1 | row 2
    xvpermi.q       xr14, xr15, 0x02        // xr14 = row 2 | row 3
    xvpermi.q       xr15, xr16, 0x02        // xr15 = row 3 | row 4

    // output rows 0 and 1
    xvmulwev.h.bu   xr19, xr9, xr11         // A * src[i]
    xvmaddwod.h.bu  xr19, xr9, xr11         // + B * src[i+1]
    xvmulwev.h.bu   xr20, xr10, xr12        // C * src[i+stride]
    xvmaddwod.h.bu  xr20, xr10, xr12        // + D * src[i+stride+1]
    xvadd.h         xr21, xr19, xr20
    xvsrarni.b.h    xr21, xr21, 6           // rounding >> 6, narrow per 128-bit lane
    vstelm.d        vr21, a0, 0, 0          // low lane  -> first row of the pair
    add.d           a0, a0, a2
    xvstelm.d       xr21, a0, 0, 2          // high lane -> second row of the pair
    add.d           a0, a0, a2
    // output rows 2 and 3
    xvmulwev.h.bu   xr13, xr9, xr14
    xvmaddwod.h.bu  xr13, xr9, xr14
    xvmulwev.h.bu   xr14, xr10, xr15
    xvmaddwod.h.bu  xr14, xr10, xr15
    xvadd.h         xr13, xr13, xr14
    xvsrarni.b.h    xr13, xr13, 6
    vstelm.d        vr13, a0, 0, 0
    add.d           a0, a0, a2
    xvstelm.d       xr13, a0, 0, 2
    add.d           a0, a0, a2

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOP_DA
    b               .ENDLOOPA
.ENDLOOP_DA:

    // ---- D == 0, E > 0: one-dimensional filter ----
    bge             zero, t0, .ENDLOOP_EA
    move            t1, a3
    li.d            t7, 1
    slt             t8, zero, t5            // t8 = (C > 0)
    maskeqz         t5, a2, t8              // t5 = C > 0 ? stride : 0
    masknez         t7, t7, t8              // t7 = C > 0 ? 0 : 1
    or              t7, t7, t5              // t7 = step to the second tap
    xvilvl.b        xr7, xr4, xr0           // xr7 = A,E,A,E,...
.LOOP_EA:
    // load both taps of source rows 0..3
    fld.d           f5, a1, 0
    fldx.d          f6, a1, t7
    add.d           a1, a1, a2
    fld.d           f9, a1, 0
    fldx.d          f10, a1, t7
    add.d           a1, a1, a2
    fld.d           f11, a1, 0
    fldx.d          f12, a1, t7
    add.d           a1, a1, a2
    fld.d           f13, a1, 0
    fldx.d          f14, a1, t7
    vilvl.b         vr5, vr6, vr5           // row 0: src[i], src[i+step] pairs
    vilvl.b         vr9, vr10, vr9          // row 1
    vilvl.b         vr11, vr12, vr11        // row 2
    vilvl.b         vr13, vr14, vr13        // row 3
    xvpermi.q       xr5, xr9, 0x02          // xr5  = row 0 | row 1
    xvpermi.q       xr11, xr13, 0x02        // xr11 = row 2 | row 3

    xvmulwev.h.bu   xr8, xr7, xr5           // A * src[i]
    xvmaddwod.h.bu  xr8, xr7, xr5           // + E * src[i+step]
    xvmulwev.h.bu   xr6, xr7, xr11
    xvmaddwod.h.bu  xr6, xr7, xr11
    xvsrarni.b.h    xr8, xr8, 6
    vstelm.d        vr8, a0, 0, 0           // output row 0
    add.d           a0, a0, a2
    xvstelm.d       xr8, a0, 0, 2           // output row 1
    add.d           a0, a0, a2
    xvsrarni.b.h    xr6, xr6, 6
    vstelm.d        vr6, a0, 0, 0           // output row 2
    add.d           a0, a0, a2
    xvstelm.d       xr6, a0, 0, 2           // output row 3
    add.d           a0, a0, a2
    add.d           a1, a1, a2              // a1 now on source row 4

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOP_EA
    b               .ENDLOOPA
.ENDLOOP_EA:

    // ---- D == 0, E == 0: only the A tap contributes ----
    move            t1, a3
.LOOPA:
    fld.d           f5, a1, 0
    fldx.d          f6, a1, a2
    fldx.d          f7, a1, t2
    fldx.d          f8, a1, t3
    vilvl.d         vr5, vr6, vr5           // row 0 | row 1 (64 bits each)
    vilvl.d         vr7, vr8, vr7           // row 2 | row 3
    xvpermi.q       xr5, xr7, 0x02          // xr5 = rows 0,1 (low lane), rows 2,3 (high lane)
    xvmulwev.h.bu   xr6, xr0, xr5           // A * even pixels
    xvmulwod.h.bu   xr7, xr0, xr5           // A * odd pixels
    xvilvl.h        xr8, xr7, xr6           // ordered products of rows 0 and 2
    xvilvh.h        xr9, xr7, xr6           // ordered products of rows 1 and 3
    xvsrarni.b.h    xr9, xr8, 6             // xr9 = row 0, row 1 | row 2, row 3
    vstelm.d        vr9, a0, 0, 0
    add.d           a0, a0, a2
    vstelm.d        vr9, a0, 0, 1
    add.d           a0, a0, a2
    xvstelm.d       xr9, a0, 0, 2
    add.d           a0, a0, a2
    xvstelm.d       xr9, a0, 0, 3
    add.d           a0, a0, a2
    add.d           a1, a1, t4              // src += 4 * stride

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOPA
.ENDLOOPA:
endfunc
|
|
|
/*
 * void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 *                                  int h, int x, int y)
 *
 * 8-pixel-wide bilinear chroma interpolation averaged with the existing
 * destination, using 256-bit LASX vectors:
 *   pred   = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 *   dst[i] = (dst[i] + pred + 1) >> 1
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and E = B + C.
 *
 * Registers: a0 = dst, a1 = src, a2 = stride, a3 = h, a4 = x, a5 = y.
 * Clobbers:  t0-t8, xr0-xr21 (rows are loaded through the f registers that
 *            alias the low 64 bits of the same-numbered vector registers).
 *
 * Path selection:
 *   D > 0        two-dimensional filter (.LOOPDA)
 *   else E > 0   one-dimensional filter with step = C > 0 ? stride : 1 (.LOOPEA)
 *   else         A-only scaling (.LOOPELSEA)
 *
 * Two rows are paired per 256-bit register (one per 128-bit lane) and four
 * rows are written per iteration; the counter is tested after the body, so h
 * is expected to be a positive multiple of 4.
 * NOTE(review): confirm against the callers.
 */
function ff_avg_h264_chroma_mc8_lasx
    li.d            t8, 8
    sub.d           t1, t8, a4              // 8-x
    sub.d           t2, t8, a5              // 8-y
    mul.d           t3, t1, t2              // A
    mul.d           t4, a4, t2              // B
    mul.d           t5, t1, a5              // C
    mul.d           t6, a4, a5              // D
    add.d           t0, t4, t5              // E
    xvreplgr2vr.b   xr0, t3                 // xr0 = A in every byte
    xvreplgr2vr.b   xr1, t4                 // xr1 = B
    xvreplgr2vr.b   xr2, t5                 // xr2 = C
    xvreplgr2vr.b   xr3, t6                 // xr3 = D
    xvreplgr2vr.b   xr4, t0                 // xr4 = E
    slli.d          t2, a2, 1               // t2 = 2 * stride
    add.d           t3, t2, a2              // t3 = 3 * stride
    slli.d          t4, a2, 2               // t4 = 4 * stride

    // ---- D > 0: two-dimensional filter ----
    bge             zero, t6, .ENDLOOPDA
    move            t1, a3                  // t1 = rows remaining
    xvilvl.b        xr9, xr1, xr0           // xr9  = A,B,A,B,...
    xvilvl.b        xr10, xr3, xr2          // xr10 = C,D,C,D,...
.LOOPDA:
    // load source rows 0..4 at offsets 0 and 1; a1 is left on row 4
    fld.d           f5, a1, 0
    fld.d           f6, a1, 1
    add.d           a1, a1, a2
    fld.d           f7, a1, 0
    fld.d           f8, a1, 1
    add.d           a1, a1, a2
    fld.d           f11, a1, 0
    fld.d           f12, a1, 1
    add.d           a1, a1, a2
    fld.d           f13, a1, 0
    fld.d           f14, a1, 1
    add.d           a1, a1, a2
    fld.d           f15, a1, 0
    fld.d           f16, a1, 1
    // load the four destination rows that will be averaged
    fld.d           f17, a0, 0
    fldx.d          f18, a0, a2
    fldx.d          f19, a0, t2
    fldx.d          f20, a0, t3
    vilvl.b         vr5, vr6, vr5           // row 0: src[i], src[i+1] pairs
    vilvl.b         vr7, vr8, vr7           // row 1
    vilvl.b         vr11, vr12, vr11        // row 2
    vilvl.b         vr13, vr14, vr13        // row 3
    vilvl.b         vr16, vr16, vr15        // row 4
    xvpermi.q       xr5, xr7, 0x02          // xr5  = row 0 | row 1
    xvpermi.q       xr7, xr11, 0x02         // xr7  = row 1 | row 2
    xvpermi.q       xr11, xr13, 0x02        // xr11 = row 2 | row 3
    xvpermi.q       xr13, xr16, 0x02        // xr13 = row 3 | row 4
    xvpermi.q       xr17, xr18, 0x02        // xr17 = dst 0 | dst 1
    xvpermi.q       xr19, xr20, 0x02        // xr19 = dst 2 | dst 3

    // output rows 0 and 1
    xvmulwev.h.bu   xr14, xr9, xr5          // A * src[i]
    xvmaddwod.h.bu  xr14, xr9, xr5          // + B * src[i+1]
    xvmulwev.h.bu   xr15, xr10, xr7         // C * src[i+stride]
    xvmaddwod.h.bu  xr15, xr10, xr7         // + D * src[i+stride+1]
    xvadd.h         xr14, xr14, xr15
    xvsrari.h       xr14, xr14, 6           // pred = rounding >> 6 (still 16-bit)
    xvsllwil.hu.bu  xr17, xr17, 0           // widen dst bytes to halfwords per lane
    xvadd.h         xr20, xr14, xr17
    xvsrarni.b.h    xr20, xr20, 1           // (pred + dst + 1) >> 1, narrow per lane
    xvstelm.d       xr20, a0, 0, 0
    add.d           a0, a0, a2
    xvstelm.d       xr20, a0, 0, 2
    add.d           a0, a0, a2
    // output rows 2 and 3
    xvmulwev.h.bu   xr14, xr9, xr11
    xvmaddwod.h.bu  xr14, xr9, xr11
    xvmulwev.h.bu   xr15, xr10, xr13
    xvmaddwod.h.bu  xr15, xr10, xr13
    xvadd.h         xr14, xr14, xr15
    xvsrari.h       xr14, xr14, 6
    xvsllwil.hu.bu  xr19, xr19, 0
    xvadd.h         xr21, xr14, xr19
    xvsrarni.b.h    xr21, xr21, 1
    xvstelm.d       xr21, a0, 0, 0
    add.d           a0, a0, a2
    xvstelm.d       xr21, a0, 0, 2
    add.d           a0, a0, a2

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOPDA
    b               .ENDLOOPELSEA
.ENDLOOPDA:

    // ---- D == 0, E > 0: one-dimensional filter ----
    bge             zero, t0, .ENDLOOPEA
    move            t1, a3
    li.d            t7, 1
    slt             t8, zero, t5            // t8 = (C > 0)
    maskeqz         t5, a2, t8              // t5 = C > 0 ? stride : 0
    masknez         t7, t7, t8              // t7 = C > 0 ? 0 : 1
    or              t7, t7, t5              // t7 = step to the second tap
    xvilvl.b        xr7, xr4, xr0           // xr7 = A,E,A,E,...
.LOOPEA:
    // load both taps of source rows 0..3; a1 is left on row 4
    fld.d           f5, a1, 0
    fldx.d          f6, a1, t7
    add.d           a1, a1, a2
    fld.d           f8, a1, 0
    fldx.d          f9, a1, t7
    add.d           a1, a1, a2
    fld.d           f10, a1, 0
    fldx.d          f11, a1, t7
    add.d           a1, a1, a2
    fld.d           f12, a1, 0
    fldx.d          f13, a1, t7
    add.d           a1, a1, a2
    // load the four destination rows that will be averaged
    fld.d           f14, a0, 0
    fldx.d          f15, a0, a2
    fldx.d          f16, a0, t2
    fldx.d          f17, a0, t3
    vilvl.b         vr5, vr6, vr5           // row 0: src[i], src[i+step] pairs
    vilvl.b         vr8, vr9, vr8           // row 1
    vilvl.b         vr10, vr11, vr10        // row 2
    vilvl.b         vr12, vr13, vr12        // row 3
    xvpermi.q       xr5, xr8, 0x02          // xr5  = row 0 | row 1
    xvpermi.q       xr10, xr12, 0x02        // xr10 = row 2 | row 3
    xvpermi.q       xr14, xr15, 0x02        // xr14 = dst 0 | dst 1
    xvpermi.q       xr16, xr17, 0x02        // xr16 = dst 2 | dst 3

    // output rows 0 and 1
    xvmulwev.h.bu   xr6, xr7, xr5           // A * src[i]
    xvmaddwod.h.bu  xr6, xr7, xr5           // + E * src[i+step]
    xvsrari.h       xr6, xr6, 6
    xvsllwil.hu.bu  xr14, xr14, 0
    xvadd.h         xr8, xr6, xr14
    xvsrarni.b.h    xr8, xr8, 1
    xvstelm.d       xr8, a0, 0, 0
    add.d           a0, a0, a2
    xvstelm.d       xr8, a0, 0, 2
    add.d           a0, a0, a2
    // output rows 2 and 3
    xvmulwev.h.bu   xr6, xr7, xr10
    xvmaddwod.h.bu  xr6, xr7, xr10
    xvsrari.h       xr6, xr6, 6
    xvsllwil.hu.bu  xr16, xr16, 0
    xvadd.h         xr8, xr6, xr16
    xvsrarni.b.h    xr8, xr8, 1
    xvstelm.d       xr8, a0, 0, 0
    add.d           a0, a0, a2
    xvstelm.d       xr8, a0, 0, 2
    add.d           a0, a0, a2

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOPEA
    b               .ENDLOOPELSEA
.ENDLOOPEA:

    // ---- D == 0, E == 0: only the A tap contributes ----
    move            t1, a3
.LOOPELSEA:
    fld.d           f5, a1, 0
    fldx.d          f6, a1, a2
    fldx.d          f7, a1, t2
    fldx.d          f8, a1, t3
    fld.d           f9, a0, 0
    fldx.d          f10, a0, a2
    fldx.d          f11, a0, t2
    fldx.d          f12, a0, t3
    xvpermi.q       xr5, xr6, 0x02          // xr5  = row 0 | row 1
    xvpermi.q       xr7, xr8, 0x02          // xr7  = row 2 | row 3
    xvpermi.q       xr9, xr10, 0x02         // xr9  = dst 0 | dst 1
    xvpermi.q       xr11, xr12, 0x02        // xr11 = dst 2 | dst 3

    // output rows 0 and 1
    xvmulwev.h.bu   xr12, xr0, xr5          // A * even pixels
    xvmulwod.h.bu   xr13, xr0, xr5          // A * odd pixels
    xvilvl.h        xr12, xr13, xr12        // products of pixels 0..7 in order, per lane
    xvsrari.h       xr12, xr12, 6
    xvsllwil.hu.bu  xr9, xr9, 0
    xvadd.h         xr9, xr12, xr9
    xvsrarni.b.h    xr9, xr9, 1
    xvstelm.d       xr9, a0, 0, 0
    add.d           a0, a0, a2
    xvstelm.d       xr9, a0, 0, 2
    add.d           a0, a0, a2
    // output rows 2 and 3
    xvmulwev.h.bu   xr12, xr0, xr7
    xvmulwod.h.bu   xr13, xr0, xr7
    xvilvl.h        xr12, xr13, xr12
    xvsrari.h       xr12, xr12, 6
    xvsllwil.hu.bu  xr11, xr11, 0
    xvadd.h         xr13, xr12, xr11
    xvsrarni.b.h    xr13, xr13, 1
    xvstelm.d       xr13, a0, 0, 0
    add.d           a0, a0, a2
    xvstelm.d       xr13, a0, 0, 2
    add.d           a0, a0, a2
    add.d           a1, a1, t4              // src += 4 * stride

    addi.d          t1, t1, -4
    blt             zero, t1, .LOOPELSEA
.ENDLOOPELSEA:
endfunc
|
|
|
/*
 * void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
 *                                  int h, int x, int y)
 *
 * 4-pixel-wide bilinear chroma interpolation:
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y and E = B + C.
 *
 * Apart from one xvadd.h the body uses 128-bit LSX operations only; rows are
 * loaded with 8-byte fld.d/fldx.d through the f registers that alias the low
 * 64 bits of the same-numbered vector registers.
 *
 * Registers: a0 = dst, a1 = src, a2 = stride, a3 = h, a4 = x, a5 = y.
 * Clobbers:  t0-t8, vr0-vr16 (xr14 by the 256-bit add).
 *
 * Path selection:
 *   D > 0        two-dimensional filter (.PUT_DA)
 *   else E > 0   one-dimensional filter with step = C > 0 ? stride : 1 (.PUT_EA)
 *   else         A-only scaling (.PUTA)
 *
 * Two output rows are packed into one vector per iteration and the counter is
 * tested after the body, so h is expected to be a positive multiple of 2.
 * NOTE(review): confirm against the callers.
 */
function ff_put_h264_chroma_mc4_lasx
    li.d            t8, 8
    sub.d           t1, t8, a4              // 8-x
    sub.d           t2, t8, a5              // 8-y
    mul.d           t3, t1, t2              // A
    mul.d           t4, a4, t2              // B
    mul.d           t5, t1, a5              // C
    mul.d           t6, a4, a5              // D
    add.d           t0, t4, t5              // E
    slli.d          t8, a2, 1               // t8 = 2 * stride (used by .PUTA only)
    vreplgr2vr.b    vr0, t3                 // vr0 = A in every byte
    vreplgr2vr.b    vr1, t4                 // vr1 = B
    vreplgr2vr.b    vr2, t5                 // vr2 = C
    vreplgr2vr.b    vr3, t6                 // vr3 = D
    vreplgr2vr.b    vr4, t0                 // vr4 = E

    // ---- D > 0: two-dimensional filter ----
    bge             zero, t6, .ENDPUT_DA
    move            t1, a3                  // t1 = rows remaining
    vilvl.b         vr9, vr1, vr0           // vr9  = A,B,A,B,...
    vilvl.b         vr10, vr3, vr2          // vr10 = C,D,C,D,...
.PUT_DA:
    // load source rows 0, 1 and 2; a1 is left on row 2 for the next iteration
    fld.d           f5, a1, 0
    fld.d           f6, a1, 1
    add.d           a1, a1, a2
    fld.d           f7, a1, 0
    fld.d           f8, a1, 1
    add.d           a1, a1, a2
    fld.d           f11, a1, 0
    fld.d           f12, a1, 1
    vilvl.b         vr5, vr6, vr5           // row 0: src[i], src[i+1] pairs
    vilvl.b         vr7, vr8, vr7           // row 1
    vilvl.b         vr13, vr12, vr11        // row 2
    vilvl.d         vr5, vr7, vr5           // upper rows:  row 0 | row 1
    vilvl.d         vr13, vr13, vr7         // lower rows:  row 1 | row 2
    vmulwev.h.bu    vr14, vr9, vr5          // A * src[i]
    vmaddwod.h.bu   vr14, vr9, vr5          // + B * src[i+1]
    vmulwev.h.bu    vr15, vr10, vr13        // C * src[i+stride]
    vmaddwod.h.bu   vr15, vr10, vr13        // + D * src[i+stride+1]
    // NOTE(review): 256-bit add on operands produced by 128-bit instructions;
    // only the low 128 bits of xr14 are consumed below.
    xvadd.h         xr14, xr14, xr15
    vsrarni.b.h     vr16, vr14, 6           // low 64 bits of vr16 = both output rows
    vstelm.w        vr16, a0, 0, 0          // output row 0 (4 pixels)
    add.d           a0, a0, a2
    vstelm.w        vr16, a0, 0, 1          // output row 1
    add.d           a0, a0, a2
    addi.d          t1, t1, -2
    blt             zero, t1, .PUT_DA
    b               .ENDPUTA
.ENDPUT_DA:

    // ---- D == 0, E > 0: one-dimensional filter ----
    bge             zero, t0, .ENDPUT_EA
    move            t1, a3
    li.d            t7, 1
    slt             t8, zero, t5            // t8 = (C > 0); 2*stride is not needed here
    maskeqz         t5, a2, t8              // t5 = C > 0 ? stride : 0
    masknez         t7, t7, t8              // t7 = C > 0 ? 0 : 1
    or              t7, t7, t5              // t7 = step to the second tap
    vilvl.b         vr7, vr4, vr0           // vr7 = A,E,A,E,...
.PUT_EA:
    fld.d           f5, a1, 0
    fldx.d          f6, a1, t7
    vilvl.b         vr5, vr6, vr5           // row 0: src[i], src[i+step] pairs
    add.d           a1, a1, a2
    fld.d           f8, a1, 0
    fldx.d          f9, a1, t7
    vilvl.b         vr8, vr9, vr8           // row 1
    vilvl.d         vr5, vr8, vr5           // row 0 | row 1
    vmulwev.h.bu    vr6, vr7, vr5           // A * src[i]
    vmaddwod.h.bu   vr6, vr7, vr5           // + E * src[i+step]
    vsrarni.b.h     vr6, vr6, 6
    vstelm.w        vr6, a0, 0, 0
    add.d           a0, a0, a2
    vstelm.w        vr6, a0, 0, 1
    add.d           a0, a0, a2
    add.d           a1, a1, a2
    addi.d          t1, t1, -2
    blt             zero, t1, .PUT_EA
    b               .ENDPUTA
.ENDPUT_EA:

    // ---- D == 0, E == 0: only the A tap contributes ----
    move            t1, a3
.PUTA:
    fld.d           f5, a1, 0
    fldx.d          f8, a1, a2
    vilvl.w         vr5, vr8, vr5           // low 8 bytes = row 0 (4 px) | row 1 (4 px)
    vmulwev.h.bu    vr6, vr0, vr5           // A * even pixels
    vmulwod.h.bu    vr7, vr0, vr5           // A * odd pixels
    vilvl.h         vr6, vr7, vr6           // products of the 8 pixels in order
    vsrarni.b.h     vr6, vr6, 6
    vstelm.w        vr6, a0, 0, 0
    add.d           a0, a0, a2
    vstelm.w        vr6, a0, 0, 1
    add.d           a0, a0, a2
    add.d           a1, a1, t8              // src += 2 * stride
    addi.d          t1, t1, -2
    blt             zero, t1, .PUTA
.ENDPUTA:
endfunc
|
|
|